Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
591 changes: 591 additions & 0 deletions src/maxtext/checkpoint_conversion/compare_linen_nnx_checkpoint.py

Large diffs are not rendered by default.

439 changes: 439 additions & 0 deletions src/maxtext/checkpoint_conversion/linen_nnx_converter.py

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
"""

import argparse
import functools
import gc
import os
import sys
Expand Down Expand Up @@ -87,7 +88,10 @@ def convert(paxml_ckpt_path, maxtext_model_name, base_output_directory, run_name
mesh = Mesh(devices_array, cfg.mesh_axes)

quant = quantizations.configure_quantization(cfg)
model = transformer_as_linen(cfg, mesh, quant=quant, model_mode=MODEL_MODE_TRAIN)
if cfg.pure_nnx:
raise NotImplementedError("Pure NNX support has not been implemented yet.")
else:
model = transformer_as_linen(cfg, mesh, quant=quant, model_mode=MODEL_MODE_TRAIN)
learning_rate_schedule = maxtext_utils.create_learning_rate_schedule(cfg)
tx = optimizers.get_optimizer(cfg, learning_rate_schedule)

Expand All @@ -98,7 +102,12 @@ def convert(paxml_ckpt_path, maxtext_model_name, base_output_directory, run_name
cfg.checkpoint_period,
)

state, _, _, _ = maxtext_utils.setup_training_state(model, None, tx, cfg, init_rng, mesh, checkpoint_manager)
if cfg.pure_nnx:
# NNX has a different function to init the training state.
raise NotImplementedError("Pure NNX support has not been implemented yet.")
else:
init_state_fn = functools.partial(maxtext_utils.init_initial_state, model, tx, cfg, True, init_rng)
state, _, _, _ = maxtext_utils.setup_training_state(None, cfg, mesh, checkpoint_manager, init_state_fn)
max_logging.log("start")
max_utils.print_mem_stats("After params initialized")

Expand Down
45 changes: 38 additions & 7 deletions src/maxtext/common/checkpointing.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from absl import flags
import datetime
from etils import epath
from flax import nnx
from flax.training import train_state
import jax
from maxtext.utils.globals import DEFAULT_OCDBT_TARGET_DATA_FILE_SIZE
Expand Down Expand Up @@ -522,7 +523,7 @@ def load_state_if_possible(
load_parameters_from_path: str,
load_full_state_from_path: str,
checkpoint_storage_concurrent_gb: int,
abstract_unboxed_pre_state: train_state.TrainState,
abstract_unboxed_pre_state: train_state.TrainState | nnx.State,
enable_single_replica_ckpt_restoring: bool | None = False,
dataset_type: str | None = "tfds",
step: int = -1, # -1 means latest
Expand Down Expand Up @@ -590,8 +591,13 @@ def map_to_pspec(data):
)
ocp.type_handlers.register_type_handler(jax.Array, array_handler, override=True)

restore_args = jax.tree_util.tree_map(map_to_pspec, abstract_unboxed_pre_state)
checkpoint_args = ocp.args.PyTreeRestore(item=abstract_unboxed_pre_state, restore_args=restore_args)
# Convert nnx.State to pure dict to match how checkpoints are saved for NNX
restore_target = abstract_unboxed_pre_state
if isinstance(abstract_unboxed_pre_state, nnx.State):
restore_target = abstract_unboxed_pre_state.to_pure_dict()

restore_args = jax.tree_util.tree_map(map_to_pspec, restore_target)
checkpoint_args = ocp.args.PyTreeRestore(item=restore_target, restore_args=restore_args)

match (checkpoint_manager, dataset_type, data_iterator):
# Case 1: Matches if 'checkpoint_manager' is an instance of either EmergencyCheckpointManager
Expand Down Expand Up @@ -626,9 +632,14 @@ def map_to_pspec(data):
return (checkpoint_manager.restore(step, args=Composite(items=checkpoint_args)), None)

if load_parameters_from_path != "":
if isinstance(abstract_unboxed_pre_state, nnx.State):
_, params, _ = nnx.split(abstract_unboxed_pre_state.model, nnx.Param, ...)
else:
params = abstract_unboxed_pre_state.params

restored_params = load_params_from_path(
load_parameters_from_path,
abstract_unboxed_pre_state.params,
params,
checkpoint_storage_concurrent_gb,
use_ocdbt=use_ocdbt,
use_zarr3=use_zarr3,
Expand Down Expand Up @@ -712,15 +723,35 @@ def save_params_to_path(checkpoint_dir, params, use_ocdbt=True, use_zarr3=True):
print(f"Quantized params checkpoint saved at: {checkpoint_dir}")


def maybe_save_checkpoint(checkpoint_manager, state, config, data_iterator, step=None):
"""Save checkpoint if checkpointing is enabled."""
def maybe_save_checkpoint(checkpoint_manager, state, config, data_iterator, step=None, force=False):
"""Save checkpoint if checkpointing is enabled.

Args:
checkpoint_manager: The checkpoint manager.
state: The training state to save.
config: The config object.
data_iterator: The data iterator.
step: The step number. If None, the last completed step is derived from state (state.step - 1 for a Linen TrainState, state.optimizer.step - 1 in pure NNX mode).
force: If True, force save the checkpoint regardless of checkpoint_period.
"""
if checkpoint_manager is None:
return

# Determine the effective step for saving a checkpoint.
# If 'step' is not provided, this call is for a potential final checkpoint,
# so use the last completed step from the state.
actual_step = (int(state.step) - 1) if step is None else int(step)
if step is not None:
actual_step = int(step)
else:
if config.pure_nnx:
actual_step = int(state.optimizer.step) - 1
else:
# Linen TrainState has .step attribute
actual_step = int(state.step) - 1

if config.pure_nnx:
# Convert nnx.State to dict.
state = state.to_pure_dict()

# Determine if a checkpoint save should be forced, overriding the usual `config.checkpoint_period` logic.
# This occurs if this function was called:
Expand Down
9 changes: 9 additions & 0 deletions src/maxtext/common/gcloud_stub.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,15 @@ def is_decoupled() -> bool: # dynamic check so setting env after initial import
return os.environ.get("DECOUPLE_GCLOUD", "").upper() == "TRUE"


def is_pure_nnx() -> bool:
  """Report whether pure NNX mode is requested through the PURE_NNX env var.

  The environment is consulted on every call (nothing is cached), so setting
  the variable after this module was first imported still takes effect. The
  comparison is case-insensitive, and an unset variable is treated as "FALSE",
  i.e. Linen remains the default test mode.

  Set PURE_NNX=TRUE to opt in to NNX mode (skips linen_only tests, runs
  nnx_only tests).

  Returns:
    True only when PURE_NNX upper-cases to exactly "TRUE"; False otherwise.
  """
  raw_flag = os.getenv("PURE_NNX", "FALSE")
  return raw_flag.upper() == "TRUE"


T = TypeVar("T")


Expand Down
5 changes: 4 additions & 1 deletion src/maxtext/configs/base.yml
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,7 @@ logical_axis_rules: [
['paged_kv_head_dim_size', []],
['dense_layers', []],
['moe_layers', []],
['num_activations', []],
['engram_dim', ['tensor']],
['mhc', []],
['diloco', 'diloco'],
Expand Down Expand Up @@ -1078,7 +1079,9 @@ position_id_per_seconds: 25
subslice_shape: ""

# NNX
enable_nnx: false
enable_nnx: True
pure_nnx_decoder: True
pure_nnx: True

################################## Qwen3-Next Specific Configs ##################################
# Kernel size for the 1D convolution in the Gated Delta Net
Expand Down
4 changes: 4 additions & 0 deletions src/maxtext/configs/decoupled_base_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ eval_dataset_name: 'c4/en:3.1.0'
# Use dot_product attention to avoid GPU Pallas shared memory limits on AMD GPUs
attention: "dot_product"

# Default to Linen mode for tests; NNX is opt-in via PURE_NNX=TRUE.
pure_nnx: False
pure_nnx_decoder: False

# Avoid HLO dump overhead.
dump_hlo: false
jax_cache_dir: ""
Expand Down
2 changes: 2 additions & 0 deletions src/maxtext/configs/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -777,6 +777,8 @@ class HardwareAndMesh(BaseModel):
enable_nnx: bool = Field(False, description="Whether to use NNX for model definition.")
optimize_mesh_for_tpu_v6e: bool = Field(False, description="Apply transformations to the mesh for TPU v6e.")
shardy: bool = Field(True, description="Whether to use shardy XLA backend.")
pure_nnx_decoder: bool = Field(False, description="Whether to enable pure NNX decoder.")
pure_nnx: bool = Field(False, description="Whether to enable pure NNX mode.")


class LayoutAndSharding(BaseModel):
Expand Down
32 changes: 26 additions & 6 deletions src/maxtext/experimental/rl/grpo_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -546,23 +546,43 @@ def setup_train_loop(
max_logging.log("Training mesh used for the workload")
num_inference_devices = config.inference_devices_per_replica * config.inference_replicas
training_devices = jax.devices()[num_inference_devices:]
model = mt.from_config(config, devices=training_devices)
if config.pure_nnx:
raise NotImplementedError("Pure NNX support has not been implemented yet.")
else:
model = mt.from_config(config, devices=training_devices)
mesh = model.mesh
max_logging.log("Inference mesh used for the workload")
inference_devices = jax.devices()[:num_inference_devices]
inference_model = mt.from_config(config_inference, devices=inference_devices)
if config_inference.pure_nnx:
raise NotImplementedError("Pure NNX support has not been implemented yet.")
else:
inference_model = mt.from_config(config_inference, devices=inference_devices)
inference_mesh = inference_model.mesh
init_rng, checkpoint_manager, learning_rate_schedule, tx = train_utils.create_training_tools(config, model, mesh)
init_rng = jax.random.PRNGKey(config.init_weights_seed)
learning_rate_schedule, tx = train_utils.create_training_optimizer(config, model)
if config.pure_nnx:
# NNX has a different function to init the training state.
raise NotImplementedError("Pure NNX support has not been implemented yet.")
else:
init_state_fn = functools.partial(maxtext_utils.init_initial_state, model, tx, config, True, init_rng)
checkpoint_manager = train_utils.create_checkpoint_manager(config, mesh, init_state_fn)

with maybe_record_goodput(recorder, GoodputEvent.TRAINING_PREPARATION):
data_iterator = grpo_input_pipeline.create_data_iterator(config_inference, inference_mesh)
state, _, state_mesh_shardings, data_iterator = maxtext_utils.setup_training_state(
model, data_iterator, tx, config, init_rng, mesh, checkpoint_manager
data_iterator, config, mesh, checkpoint_manager, init_state_fn
)

# create inference_state_mesh_shardings from inference_mesh
if config_inference.pure_nnx:
# NNX has a different function to init the training state.
raise NotImplementedError("Pure NNX support has not been implemented yet.")
else:
init_inference_state_fn = functools.partial(
maxtext_utils.init_initial_state, inference_model, tx, config_inference, False, init_rng
)
inference_state_mesh_shardings = maxtext_utils.get_abstract_state(
inference_model, tx, config_inference, init_rng, inference_mesh, is_training=False
config_inference, inference_mesh, init_inference_state_fn, is_training=False
)[2]
if not config.using_pipeline_parallelism:
# The vocab tensor(s) of shape [vocab, embed] (and transpose) are not sharded by stage
Expand Down Expand Up @@ -697,7 +717,7 @@ def train_loop(config, config_inference, recorder, state=None):
data_buffer = []
data_buffer_lock = threading.Lock()

start_step = get_first_step(state) # this is the start_step for training
start_step = get_first_step(model, state) # this is the start_step for training
prof = profiler.Profiler(config, offset_step=start_step)
inference_prof = profiler.Profiler(config_inference, offset_step=start_step)
data_loader = DataLoader(config_inference, inference_mesh, data_iterator, recorder)
Expand Down
21 changes: 16 additions & 5 deletions src/maxtext/inference/maxengine/maxengine.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,10 @@ def __init__(self, config: Any, devices: Any | None = None):

# Model and Optimizer definition
quant = quantizations.configure_quantization(config)
self.model = models.transformer_as_linen(config, mesh=self._mesh, quant=quant, model_mode=MODEL_MODE_PREFILL)
if config.pure_nnx:
raise NotImplementedError("Pure NNX support has not been implemented yet.")
else:
self.model = models.transformer_as_linen(config, mesh=self._mesh, quant=quant, model_mode=MODEL_MODE_PREFILL)
self.replicated_sharding = jax.sharding.NamedSharding(self._mesh, P(None))

self.abstract_params = None
Expand Down Expand Up @@ -229,17 +232,25 @@ def load_params(self, *args, params=None, rng: PRNGKeyType | None = None, **kwar
rng1, rng2, rng3 = jax.random.split(rng, 3)
if params:
print("Resharding given params")
if self.config.pure_nnx:
# NNX has a different function to init the training state.
raise NotImplementedError("Pure NNX support has not been implemented yet.")
else:
init_state_fn = functools.partial(maxtext_utils.init_initial_state, self.model, None, self.config, False, rng)
_, self.state_mesh_annotations, state_mesh_shardings = maxtext_utils.get_abstract_state(
self.model, None, self.config, rng, self._mesh, False
self.config, self._mesh, init_state_fn, False
)
# reshard given params based on shardings from config in MaxEngine
params = jax.device_put(params, state_mesh_shardings.params)
state = maxtext_utils.init_decode_state(None, params)
state = max_utils.unbox_logicallypartioned(state)
else:
state, self.state_mesh_annotations = maxtext_utils.setup_decode_state(
self.model, self.config, rng1, self._mesh, None
)
if self.config.pure_nnx:
# NNX has a different function to init the training state.
raise NotImplementedError("Pure NNX support has not been implemented yet.")
else:
init_state_fn = functools.partial(maxtext_utils.init_initial_state, self.model, None, self.config, False, rng1)
state, self.state_mesh_annotations = maxtext_utils.setup_decode_state(self.config, self._mesh, None, init_state_fn)
# pylint: disable=isinstance-second-argument-not-valid-type
self.abstract_params = jax.tree_util.tree_map(
lambda x: jax.ShapeDtypeStruct(shape=x.shape, dtype=x.dtype, sharding=x.sharding)
Expand Down
4 changes: 2 additions & 2 deletions src/maxtext/layers/attentions.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,14 +533,14 @@ def __init__(
elif self.is_qwen3_next:
self.query_norm = Qwen3NextRMSNorm(
num_features=self.config.head_dim,
eps=self.config.normalization_layer_epsilon,
epsilon=self.config.normalization_layer_epsilon,
dtype=self.config.dtype,
weight_dtype=self.config.weight_dtype,
rngs=self.rngs,
)
self.key_norm = Qwen3NextRMSNorm(
num_features=self.config.head_dim,
eps=self.config.normalization_layer_epsilon,
epsilon=self.config.normalization_layer_epsilon,
dtype=self.config.dtype,
weight_dtype=self.config.weight_dtype,
rngs=self.rngs,
Expand Down
20 changes: 5 additions & 15 deletions src/maxtext/layers/multi_token_prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
import jax.numpy as jnp
from jax.sharding import Mesh
from maxtext.common.common_types import Config, MODEL_MODE_TRAIN
from maxtext.layers.nnx_decoders import NNXDecoderLayer
from maxtext.utils.globals import EPS
from maxtext.layers import nnx_wrappers
from maxtext.layers.decoders import DecoderLayer
from maxtext.layers.initializers import variable_to_logically_partitioned
from maxtext.layers.linears import DenseGeneral
Expand Down Expand Up @@ -70,7 +70,7 @@ def __init__(
config: Config,
mesh: Mesh,
layer_number: int,
transformer_layer_module: Type[DecoderLayer],
transformer_layer_module: Type[NNXDecoderLayer],
*,
rngs: nnx.Rngs,
):
Expand Down Expand Up @@ -108,22 +108,12 @@ def __init__(
rngs=rngs,
)
# Use MODEL_MODE_TRAIN for initialization; runtime model_mode is passed dynamically.
mtp_transformer_layer = transformer_layer_module(
self.transformer_layer = transformer_layer_module(
config=cfg,
mesh=mesh,
model_mode=MODEL_MODE_TRAIN,
name=f"mtp_{k}_transformer_layer",
)
self.transformer_layer = nnx_wrappers.ToNNX(mtp_transformer_layer, rngs=rngs)

# ToNNX requires explicit initialization with sample inputs for proper parameter setup.
batch_size, seq_len = max_utils.get_batch_seq_len_for_mode(config=cfg, model_mode=MODEL_MODE_TRAIN)
self.transformer_layer.lazy_init(
inputs=jnp.zeros((batch_size, seq_len, self.config.emb_dim), dtype=self.config.dtype),
decoder_segment_ids=None,
decoder_positions=jnp.zeros((batch_size, seq_len), dtype=jnp.int32),
deterministic=True,
model_mode=MODEL_MODE_TRAIN,
rngs=rngs,
)

@property
Expand Down Expand Up @@ -212,7 +202,7 @@ def __init__(
self,
config: Config,
mesh: Mesh,
transformer_layer_module: Type[DecoderLayer],
transformer_layer_module: Type[NNXDecoderLayer],
decoder: nnx.Module,
rngs: nnx.Rngs,
):
Expand Down
Loading
Loading