AI-Hypercomputer · xibinliu · Mar 5, 2026 · Mar 9, 2026 · Jan 19, 2026 · Jan 21, 2026
@@ -35,6 +35,7 @@
 """
 
 import argparse
+import functools
 import gc
 import os
 import sys
@@ -87,7 +88,10 @@ def convert(paxml_ckpt_path, maxtext_model_name, base_output_directory, run_name
   mesh = Mesh(devices_array, cfg.mesh_axes)
 
   quant = quantizations.configure_quantization(cfg)
-  model = transformer_as_linen(cfg, mesh, quant=quant, model_mode=MODEL_MODE_TRAIN)
+  if cfg.pure_nnx:
+    raise NotImplementedError("Pure NNX support has not been implemented yet.")
+  else:
+    model = transformer_as_linen(cfg, mesh, quant=quant, model_mode=MODEL_MODE_TRAIN)
   learning_rate_schedule = maxtext_utils.create_learning_rate_schedule(cfg)
   tx = optimizers.get_optimizer(cfg, learning_rate_schedule)
 
@@ -98,7 +102,12 @@ def convert(paxml_ckpt_path, maxtext_model_name, base_output_directory, run_name
       cfg.checkpoint_period,
   )
 
-  state, _, _, _ = maxtext_utils.setup_training_state(model, None, tx, cfg, init_rng, mesh, checkpoint_manager)
+  if cfg.pure_nnx:
+    # NNX has a different function to init the training state.
+    raise NotImplementedError("Pure NNX support has not been implemented yet.")
+  else:
+    init_state_fn = functools.partial(maxtext_utils.init_initial_state, model, tx, cfg, True, init_rng)
+  state, _, _, _ = maxtext_utils.setup_training_state(None, cfg, mesh, checkpoint_manager, init_state_fn)
   max_logging.log("start")
   max_utils.print_mem_stats("After params initialized")
 

@@ -20,6 +20,7 @@
 from absl import flags
 import datetime
 from etils import epath
+from flax import nnx
 from flax.training import train_state
 import jax
 from maxtext.utils.globals import DEFAULT_OCDBT_TARGET_DATA_FILE_SIZE
@@ -522,7 +523,7 @@ def load_state_if_possible(
     load_parameters_from_path: str,
     load_full_state_from_path: str,
     checkpoint_storage_concurrent_gb: int,
-    abstract_unboxed_pre_state: train_state.TrainState,
+    abstract_unboxed_pre_state: train_state.TrainState | nnx.State,
     enable_single_replica_ckpt_restoring: bool | None = False,
     dataset_type: str | None = "tfds",
     step: int = -1,  # -1 means latest
@@ -590,8 +591,13 @@ def map_to_pspec(data):
         )
         ocp.type_handlers.register_type_handler(jax.Array, array_handler, override=True)
 
-      restore_args = jax.tree_util.tree_map(map_to_pspec, abstract_unboxed_pre_state)
-      checkpoint_args = ocp.args.PyTreeRestore(item=abstract_unboxed_pre_state, restore_args=restore_args)
+      # Convert nnx.State to pure dict to match how checkpoints are saved for NNX
+      restore_target = abstract_unboxed_pre_state
+      if isinstance(abstract_unboxed_pre_state, nnx.State):
+        restore_target = abstract_unboxed_pre_state.to_pure_dict()
+
+      restore_args = jax.tree_util.tree_map(map_to_pspec, restore_target)
+      checkpoint_args = ocp.args.PyTreeRestore(item=restore_target, restore_args=restore_args)
 
       match (checkpoint_manager, dataset_type, data_iterator):
         # Case 1: Matches if 'checkpoint_manager' is an instance of either EmergencyCheckpointManager
@@ -626,9 +632,14 @@ def map_to_pspec(data):
           return (checkpoint_manager.restore(step, args=Composite(items=checkpoint_args)), None)
 
   if load_parameters_from_path != "":
+    if isinstance(abstract_unboxed_pre_state, nnx.State):
+      _, params, _ = nnx.split(abstract_unboxed_pre_state.model, nnx.Param, ...)
+    else:
+      params = abstract_unboxed_pre_state.params
+
     restored_params = load_params_from_path(
         load_parameters_from_path,
-        abstract_unboxed_pre_state.params,
+        params,
         checkpoint_storage_concurrent_gb,
         use_ocdbt=use_ocdbt,
         use_zarr3=use_zarr3,
@@ -712,15 +723,35 @@ def save_params_to_path(checkpoint_dir, params, use_ocdbt=True, use_zarr3=True):
   print(f"Quantized params checkpoint saved at: {checkpoint_dir}")
 
 
-def maybe_save_checkpoint(checkpoint_manager, state, config, data_iterator, step=None):
-  """Save checkpoint if checkpointing is enabled."""
+def maybe_save_checkpoint(checkpoint_manager, state, config, data_iterator, step=None, force=False):
+  """Save checkpoint if checkpointing is enabled.
+
+  Args:
+    checkpoint_manager: The checkpoint manager.
+    state: The training state to save.
+    config: The config object.
+    data_iterator: The data iterator.
+    step: The step number. If None, extracts from state (for Linen TrainState).
+    force: If True, force save the checkpoint regardless of checkpoint_period.
+  """
   if checkpoint_manager is None:
     return
 
   # Determine the effective step for saving a checkpoint.
   # If 'step' is not provided, this call is for a potential final checkpoint
   # and use the last completed step from the state.
-  actual_step = (int(state.step) - 1) if step is None else int(step)
+  if step is not None:
+    actual_step = int(step)
+  else:
+    if config.pure_nnx:
+      actual_step = int(state.optimizer.step) - 1
+    else:
+      # Linen TrainState has .step attribute
+      actual_step = int(state.step) - 1
+
+  if config.pure_nnx:
+    # Convert nnx.State to dict.
+    state = state.to_pure_dict()
 
   # Determine if a checkpoint save should be forced, overriding the usual `config.checkpoint_period` logic.
   # This occurs if this function was called:

@@ -43,6 +43,15 @@ def is_decoupled() -> bool:  # dynamic check so setting env after initial import
   return os.environ.get("DECOUPLE_GCLOUD", "").upper() == "TRUE"
 
 
+def is_pure_nnx() -> bool:  # dynamic check so setting env after initial import still works
+  """Return True when running in pure NNX mode (PURE_NNX=TRUE env var).
+
+  Defaults to FALSE — Linen is the default test mode.
+  Set PURE_NNX=TRUE to opt in to NNX mode (skips linen_only tests, runs nnx_only tests).
+  """
+  return os.environ.get("PURE_NNX", "FALSE").upper() == "TRUE"
+
+
 T = TypeVar("T")
 
 

@@ -506,6 +506,7 @@ logical_axis_rules: [
                       ['paged_kv_head_dim_size', []],
                       ['dense_layers', []],
                       ['moe_layers', []],
+                      ['num_activations', []],
                       ['engram_dim', ['tensor']],
                       ['mhc', []],
                       ['diloco', 'diloco'],
@@ -1078,7 +1079,9 @@ position_id_per_seconds: 25
 subslice_shape: ""
 
 # NNX
-enable_nnx: false
+enable_nnx: True
+pure_nnx_decoder: True
+pure_nnx: True
 
 ################################## Qwen3-Next Specific Configs ##################################
 # Kernel size for the 1D convolution in the Gated Delta Net

@@ -30,6 +30,10 @@ eval_dataset_name: 'c4/en:3.1.0'
 # Use dot_product attention to avoid GPU Pallas shared memory limits on AMD GPUs
 attention: "dot_product"
 
+# Default to Linen mode for tests; NNX is opt-in via PURE_NNX=TRUE.
+pure_nnx: False
+pure_nnx_decoder: False
+
 # Avoid HLO dump overhead.
 dump_hlo: false
 jax_cache_dir: ""

@@ -777,6 +777,8 @@ class HardwareAndMesh(BaseModel):
   enable_nnx: bool = Field(False, description="Whether to use NNX for model definition.")
   optimize_mesh_for_tpu_v6e: bool = Field(False, description="Apply transformations to the mesh for TPU v6e.")
   shardy: bool = Field(True, description="Whether to use shardy XLA backend.")
+  pure_nnx_decoder: bool = Field(False, description="Whether to enable pure NNX decoder.")
+  pure_nnx: bool = Field(False, description="Whether to enable pure NNX mode.")
 
 
 class LayoutAndSharding(BaseModel):

@@ -546,23 +546,43 @@ def setup_train_loop(
     max_logging.log("Training mesh used for the workload")
     num_inference_devices = config.inference_devices_per_replica * config.inference_replicas
     training_devices = jax.devices()[num_inference_devices:]
-    model = mt.from_config(config, devices=training_devices)
+    if config.pure_nnx:
+      raise NotImplementedError("Pure NNX support has not been implemented yet.")
+    else:
+      model = mt.from_config(config, devices=training_devices)
     mesh = model.mesh
     max_logging.log("Inference mesh used for the workload")
     inference_devices = jax.devices()[:num_inference_devices]
-    inference_model = mt.from_config(config_inference, devices=inference_devices)
+    if config_inference.pure_nnx:
+      raise NotImplementedError("Pure NNX support has not been implemented yet.")
+    else:
+      inference_model = mt.from_config(config_inference, devices=inference_devices)
     inference_mesh = inference_model.mesh
-    init_rng, checkpoint_manager, learning_rate_schedule, tx = train_utils.create_training_tools(config, model, mesh)
+    init_rng = jax.random.PRNGKey(config.init_weights_seed)
+    learning_rate_schedule, tx = train_utils.create_training_optimizer(config, model)
+    if config.pure_nnx:
+      # NNX has a different function to init the training state.
+      raise NotImplementedError("Pure NNX support has not been implemented yet.")
+    else:
+      init_state_fn = functools.partial(maxtext_utils.init_initial_state, model, tx, config, True, init_rng)
+    checkpoint_manager = train_utils.create_checkpoint_manager(config, mesh, init_state_fn)
 
   with maybe_record_goodput(recorder, GoodputEvent.TRAINING_PREPARATION):
     data_iterator = grpo_input_pipeline.create_data_iterator(config_inference, inference_mesh)
     state, _, state_mesh_shardings, data_iterator = maxtext_utils.setup_training_state(
-        model, data_iterator, tx, config, init_rng, mesh, checkpoint_manager
+        data_iterator, config, mesh, checkpoint_manager, init_state_fn
     )
 
   # create inference_state_mesh_shardings from inference_mesh
+  if config_inference.pure_nnx:
+    # NNX has a different function to init the training state.
+    raise NotImplementedError("Pure NNX support has not been implemented yet.")
+  else:
+    init_inference_state_fn = functools.partial(
+        maxtext_utils.init_initial_state, inference_model, tx, config_inference, False, init_rng
+    )
   inference_state_mesh_shardings = maxtext_utils.get_abstract_state(
-      inference_model, tx, config_inference, init_rng, inference_mesh, is_training=False
+      config_inference, inference_mesh, init_inference_state_fn, is_training=False
   )[2]
   if not config.using_pipeline_parallelism:
     # The vocab tensor(s) of shape [vocab, embed] (and transpose) are not sharded by stage
@@ -697,7 +717,7 @@ def train_loop(config, config_inference, recorder, state=None):
   data_buffer = []
   data_buffer_lock = threading.Lock()
 
-  start_step = get_first_step(state)  # this is the start_step for training
+  start_step = get_first_step(model, state)  # this is the start_step for training
   prof = profiler.Profiler(config, offset_step=start_step)
   inference_prof = profiler.Profiler(config_inference, offset_step=start_step)
   data_loader = DataLoader(config_inference, inference_mesh, data_iterator, recorder)

@@ -113,7 +113,10 @@ def __init__(self, config: Any, devices: Any | None = None):
 
     # Model and Optimizer definition
     quant = quantizations.configure_quantization(config)
-    self.model = models.transformer_as_linen(config, mesh=self._mesh, quant=quant, model_mode=MODEL_MODE_PREFILL)
+    if config.pure_nnx:
+      raise NotImplementedError("Pure NNX support has not been implemented yet.")
+    else:
+      self.model = models.transformer_as_linen(config, mesh=self._mesh, quant=quant, model_mode=MODEL_MODE_PREFILL)
     self.replicated_sharding = jax.sharding.NamedSharding(self._mesh, P(None))
 
     self.abstract_params = None
@@ -229,17 +232,25 @@ def load_params(self, *args, params=None, rng: PRNGKeyType | None = None, **kwar
     rng1, rng2, rng3 = jax.random.split(rng, 3)
     if params:
       print("Resharding given params")
+      if self.config.pure_nnx:
+        # NNX has a different function to init the training state.
+        raise NotImplementedError("Pure NNX support has not been implemented yet.")
+      else:
+        init_state_fn = functools.partial(maxtext_utils.init_initial_state, self.model, None, self.config, False, rng)
       _, self.state_mesh_annotations, state_mesh_shardings = maxtext_utils.get_abstract_state(
-          self.model, None, self.config, rng, self._mesh, False
+          self.config, self._mesh, init_state_fn, False
       )
       # reshard given params based on shardings from config in MaxEngine
       params = jax.device_put(params, state_mesh_shardings.params)
       state = maxtext_utils.init_decode_state(None, params)
       state = max_utils.unbox_logicallypartioned(state)
     else:
-      state, self.state_mesh_annotations = maxtext_utils.setup_decode_state(
-          self.model, self.config, rng1, self._mesh, None
-      )
+      if self.config.pure_nnx:
+        # NNX has a different function to init the training state.
+        raise NotImplementedError("Pure NNX support has not been implemented yet.")
+      else:
+        init_state_fn = functools.partial(maxtext_utils.init_initial_state, self.model, None, self.config, False, rng1)
+      state, self.state_mesh_annotations = maxtext_utils.setup_decode_state(self.config, self._mesh, None, init_state_fn)
     # pylint: disable=isinstance-second-argument-not-valid-type
     self.abstract_params = jax.tree_util.tree_map(
         lambda x: jax.ShapeDtypeStruct(shape=x.shape, dtype=x.dtype, sharding=x.sharding)

@@ -533,14 +533,14 @@ def __init__(
     elif self.is_qwen3_next:
       self.query_norm = Qwen3NextRMSNorm(
           num_features=self.config.head_dim,
-          eps=self.config.normalization_layer_epsilon,
+          epsilon=self.config.normalization_layer_epsilon,
           dtype=self.config.dtype,
           weight_dtype=self.config.weight_dtype,
           rngs=self.rngs,
       )
       self.key_norm = Qwen3NextRMSNorm(
           num_features=self.config.head_dim,
-          eps=self.config.normalization_layer_epsilon,
+          epsilon=self.config.normalization_layer_epsilon,
           dtype=self.config.dtype,
           weight_dtype=self.config.weight_dtype,
           rngs=self.rngs,

@@ -22,8 +22,8 @@
 import jax.numpy as jnp
 from jax.sharding import Mesh
 from maxtext.common.common_types import Config, MODEL_MODE_TRAIN
+from maxtext.layers.nnx_decoders import NNXDecoderLayer
 from maxtext.utils.globals import EPS
-from maxtext.layers import nnx_wrappers
 from maxtext.layers.decoders import DecoderLayer
 from maxtext.layers.initializers import variable_to_logically_partitioned
 from maxtext.layers.linears import DenseGeneral
@@ -70,7 +70,7 @@ def __init__(
       config: Config,
       mesh: Mesh,
       layer_number: int,
-      transformer_layer_module: Type[DecoderLayer],
+      transformer_layer_module: Type[NNXDecoderLayer],
       *,
       rngs: nnx.Rngs,
   ):
@@ -108,22 +108,12 @@ def __init__(
         rngs=rngs,
     )
     # Use MODEL_MODE_TRAIN for initialization; runtime model_mode is passed dynamically.
-    mtp_transformer_layer = transformer_layer_module(
+    self.transformer_layer = transformer_layer_module(
         config=cfg,
         mesh=mesh,
         model_mode=MODEL_MODE_TRAIN,
         name=f"mtp_{k}_transformer_layer",
-    )
-    self.transformer_layer = nnx_wrappers.ToNNX(mtp_transformer_layer, rngs=rngs)
-
-    # ToNNX requires explicit initialization with sample inputs for proper parameter setup.
-    batch_size, seq_len = max_utils.get_batch_seq_len_for_mode(config=cfg, model_mode=MODEL_MODE_TRAIN)
-    self.transformer_layer.lazy_init(
-        inputs=jnp.zeros((batch_size, seq_len, self.config.emb_dim), dtype=self.config.dtype),
-        decoder_segment_ids=None,
-        decoder_positions=jnp.zeros((batch_size, seq_len), dtype=jnp.int32),
-        deterministic=True,
-        model_mode=MODEL_MODE_TRAIN,
+        rngs=rngs,
     )
 
   @property
@@ -212,7 +202,7 @@ def __init__(
       self,
       config: Config,
       mesh: Mesh,
-      transformer_layer_module: Type[DecoderLayer],
+      transformer_layer_module: Type[NNXDecoderLayer],
       decoder: nnx.Module,
       rngs: nnx.Rngs,
   ):