Skip to content

Commit 3fc5161

Browse files
xibinliuecnal-cienet
authored and committed
NNX migration preparation: pure_nnx flag and init_state_fn
- pure_nnx: a flag to choose pure NNX logic when NNX and Linen models co-exist. - init_state_fn: a function to initialize the model state for training. It will be set to a different function for NNX and Linen.
1 parent 69c077d commit 3fc5161

File tree

21 files changed

+538
-128
lines changed

21 files changed

+538
-128
lines changed

src/maxtext/checkpoint_conversion/standalone_scripts/convert_gpt3_ckpt_from_paxml.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
"""
3636

3737
import argparse
38+
import functools
3839
import gc
3940
import os
4041
import sys
@@ -87,7 +88,10 @@ def convert(paxml_ckpt_path, maxtext_model_name, base_output_directory, run_name
8788
mesh = Mesh(devices_array, cfg.mesh_axes)
8889

8990
quant = quantizations.configure_quantization(cfg)
90-
model = transformer_as_linen(cfg, mesh, quant=quant, model_mode=MODEL_MODE_TRAIN)
91+
if cfg.pure_nnx:
92+
raise NotImplementedError("Pure NNX support has not been implemented yet.")
93+
else:
94+
model = transformer_as_linen(cfg, mesh, quant=quant, model_mode=MODEL_MODE_TRAIN)
9195
learning_rate_schedule = maxtext_utils.create_learning_rate_schedule(cfg)
9296
tx = optimizers.get_optimizer(cfg, learning_rate_schedule)
9397

@@ -98,7 +102,12 @@ def convert(paxml_ckpt_path, maxtext_model_name, base_output_directory, run_name
98102
cfg.checkpoint_period,
99103
)
100104

101-
state, _, _, _ = maxtext_utils.setup_training_state(model, None, tx, cfg, init_rng, mesh, checkpoint_manager)
105+
if cfg.pure_nnx:
106+
# NNX has a different function to init the training state.
107+
raise NotImplementedError("Pure NNX support has not been implemented yet.")
108+
else:
109+
init_state_fn = functools.partial(maxtext_utils.init_initial_state, model, tx, cfg, True, init_rng)
110+
state, _, _, _ = maxtext_utils.setup_training_state(None, cfg, mesh, checkpoint_manager, init_state_fn)
102111
max_logging.log("start")
103112
max_utils.print_mem_stats("After params initialized")
104113

src/maxtext/configs/base.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1088,6 +1088,7 @@ subslice_shape: ""
10881088
# NNX
10891089
enable_nnx: True
10901090
pure_nnx_decoder: True
1091+
pure_nnx: True
10911092

10921093
################################## Qwen3-Next Specific Configs ##################################
10931094
# Kernel size for the 1D convolution in the Gated Delta Net

src/maxtext/configs/types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -786,6 +786,7 @@ class HardwareAndMesh(BaseModel):
786786
optimize_mesh_for_tpu_v6e: bool = Field(False, description="Apply transformations to the mesh for TPU v6e.")
787787
shardy: bool = Field(True, description="Whether to use shardy XLA backend.")
788788
pure_nnx_decoder: bool = Field(False, description="Whether to enable pure NNX decoder.")
789+
pure_nnx: bool = Field(False, description="Whether to enable pure NNX mode.")
789790

790791

791792
class LayoutAndSharding(BaseModel):

src/maxtext/experimental/rl/grpo_trainer.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -546,23 +546,43 @@ def setup_train_loop(
546546
max_logging.log("Training mesh used for the workload")
547547
num_inference_devices = config.inference_devices_per_replica * config.inference_replicas
548548
training_devices = jax.devices()[num_inference_devices:]
549-
model = mt.from_config(config, devices=training_devices)
549+
if config.pure_nnx:
550+
raise NotImplementedError("Pure NNX support has not been implemented yet.")
551+
else:
552+
model = mt.from_config(config, devices=training_devices)
550553
mesh = model.mesh
551554
max_logging.log("Inference mesh used for the workload")
552555
inference_devices = jax.devices()[:num_inference_devices]
553-
inference_model = mt.from_config(config_inference, devices=inference_devices)
556+
if config_inference.pure_nnx:
557+
raise NotImplementedError("Pure NNX support has not been implemented yet.")
558+
else:
559+
inference_model = mt.from_config(config_inference, devices=inference_devices)
554560
inference_mesh = inference_model.mesh
555-
init_rng, checkpoint_manager, learning_rate_schedule, tx = train_utils.create_training_tools(config, model, mesh)
561+
init_rng = jax.random.PRNGKey(config.init_weights_seed)
562+
learning_rate_schedule, tx = train_utils.create_training_optimizer(config, model)
563+
if config.pure_nnx:
564+
# NNX has a different function to init the training state.
565+
raise NotImplementedError("Pure NNX support has not been implemented yet.")
566+
else:
567+
init_state_fn = functools.partial(maxtext_utils.init_initial_state, model, tx, config, True, init_rng)
568+
checkpoint_manager = train_utils.create_checkpoint_manager(config, mesh, init_state_fn)
556569

557570
with maybe_record_goodput(recorder, GoodputEvent.TRAINING_PREPARATION):
558571
data_iterator = grpo_input_pipeline.create_data_iterator(config_inference, inference_mesh)
559572
state, _, state_mesh_shardings, data_iterator = maxtext_utils.setup_training_state(
560-
model, data_iterator, tx, config, init_rng, mesh, checkpoint_manager
573+
data_iterator, config, mesh, checkpoint_manager, init_state_fn
561574
)
562575

563576
# create inference_state_mesh_shardings from inference_mesh
577+
if config_inference.pure_nnx:
578+
# NNX has a different function to init the training state.
579+
raise NotImplementedError("Pure NNX support has not been implemented yet.")
580+
else:
581+
init_inference_state_fn = functools.partial(
582+
maxtext_utils.init_initial_state, inference_model, tx, config_inference, False, init_rng
583+
)
564584
inference_state_mesh_shardings = maxtext_utils.get_abstract_state(
565-
inference_model, tx, config_inference, init_rng, inference_mesh, is_training=False
585+
config_inference, inference_mesh, init_inference_state_fn, is_training=False
566586
)[2]
567587
if not config.using_pipeline_parallelism:
568588
# The vocab tensor(s) of shape [vocab, embed] (and transpose) are not sharded by stage
@@ -697,7 +717,7 @@ def train_loop(config, config_inference, recorder, state=None):
697717
data_buffer = []
698718
data_buffer_lock = threading.Lock()
699719

700-
start_step = get_first_step(state) # this is the start_step for training
720+
start_step = get_first_step(model, state) # this is the start_step for training
701721
prof = profiler.Profiler(config, offset_step=start_step)
702722
inference_prof = profiler.Profiler(config_inference, offset_step=start_step)
703723
data_loader = DataLoader(config_inference, inference_mesh, data_iterator, recorder)

src/maxtext/inference/maxengine/maxengine.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,10 @@ def __init__(self, config: Any, devices: Any | None = None):
113113

114114
# Model and Optimizer definition
115115
quant = quantizations.configure_quantization(config)
116-
self.model = models.transformer_as_linen(config, mesh=self._mesh, quant=quant, model_mode=MODEL_MODE_PREFILL)
116+
if config.pure_nnx:
117+
raise NotImplementedError("Pure NNX support has not been implemented yet.")
118+
else:
119+
self.model = models.transformer_as_linen(config, mesh=self._mesh, quant=quant, model_mode=MODEL_MODE_PREFILL)
117120
self.replicated_sharding = jax.sharding.NamedSharding(self._mesh, P(None))
118121

119122
self.abstract_params = None
@@ -229,17 +232,25 @@ def load_params(self, *args, params=None, rng: PRNGKeyType | None = None, **kwar
229232
rng1, rng2, rng3 = jax.random.split(rng, 3)
230233
if params:
231234
print("Resharding given params")
235+
if self.config.pure_nnx:
236+
# NNX has a different function to init the training state.
237+
raise NotImplementedError("Pure NNX support has not been implemented yet.")
238+
else:
239+
init_state_fn = functools.partial(maxtext_utils.init_initial_state, self.model, None, self.config, False, rng)
232240
_, self.state_mesh_annotations, state_mesh_shardings = maxtext_utils.get_abstract_state(
233-
self.model, None, self.config, rng, self._mesh, False
241+
self.config, self._mesh, init_state_fn, False
234242
)
235243
# reshard given params based on shardings from config in MaxEngine
236244
params = jax.device_put(params, state_mesh_shardings.params)
237245
state = maxtext_utils.init_decode_state(None, params)
238246
state = max_utils.unbox_logicallypartioned(state)
239247
else:
240-
state, self.state_mesh_annotations = maxtext_utils.setup_decode_state(
241-
self.model, self.config, rng1, self._mesh, None
242-
)
248+
if self.config.pure_nnx:
249+
# NNX has a different function to init the training state.
250+
raise NotImplementedError("Pure NNX support has not been implemented yet.")
251+
else:
252+
init_state_fn = functools.partial(maxtext_utils.init_initial_state, self.model, None, self.config, False, rng1)
253+
state, self.state_mesh_annotations = maxtext_utils.setup_decode_state(self.config, self._mesh, None, init_state_fn)
243254
# pylint: disable=isinstance-second-argument-not-valid-type
244255
self.abstract_params = jax.tree_util.tree_map(
245256
lambda x: jax.ShapeDtypeStruct(shape=x.shape, dtype=x.dtype, sharding=x.sharding)

src/maxtext/trainers/pre_train/train_compile.py

Lines changed: 50 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from typing import Sequence
2828

2929
from absl import app
30+
from flax import nnx
3031
from flax.linen import partitioning as nn_partitioning
3132
import jax
3233
from jax.experimental.serialize_executable import serialize
@@ -36,6 +37,7 @@
3637
from maxtext.configs import pyconfig
3738
from maxtext.common.common_types import MODEL_MODE_TRAIN, ShardMode
3839
from maxtext.layers import quantizations
40+
from maxtext.layers import train_state_nnx
3941
from maxtext.models import models
4042
from maxtext.optimizers import optimizers
4143
from maxtext.trainers.diloco import diloco
@@ -44,6 +46,8 @@
4446
from maxtext.utils import max_utils
4547
from maxtext.utils import maxtext_utils
4648
from maxtext.utils import sharding
49+
from maxtext.utils import maxtext_utils_nnx
50+
from maxtext.utils import model_creation_utils
4751

4852
# pylint: disable=too-many-positional-arguments
4953

@@ -93,7 +97,10 @@ def get_shaped_inputs(topology_mesh, config):
9397
"""Get shaped abstractions of inputs to train_step: state, batch and rng"""
9498
# Construct the model and optimizer to get shaped versions of the state
9599
quant = quantizations.configure_quantization(config)
96-
model = Transformer(config, topology_mesh, quant=quant, model_mode=MODEL_MODE_TRAIN)
100+
if config.pure_nnx:
101+
_create_model_partial, model = model_creation_utils.create_nnx_abstract_model(config, topology_mesh)
102+
else:
103+
model = Transformer(config, topology_mesh, quant=quant, model_mode=MODEL_MODE_TRAIN)
97104
# The learning_rate_schedule is baked into the compiled object.
98105
learning_rate_schedule = maxtext_utils.create_learning_rate_schedule(config)
99106
# pass in model for muon
@@ -103,18 +110,39 @@ def get_shaped_inputs(topology_mesh, config):
103110
_, example_rng = jax.random.split(jax.random.PRNGKey(0), 2)
104111
shaped_rng = jax.ShapeDtypeStruct(example_rng.shape, example_rng.dtype)
105112

106-
# Shaped state
107-
abstract_state, _, state_mesh_shardings = maxtext_utils.get_abstract_state(
108-
model, tx, config, example_rng, topology_mesh
109-
)
113+
if config.pure_nnx:
114+
115+
def create_train_state_fn():
116+
nnx_model = _create_model_partial()
117+
optimizer = nnx.Optimizer(nnx_model, tx, wrt=nnx.Param)
118+
return train_state_nnx.TrainStateNNX(nnx_model, optimizer)
119+
120+
init_state_fn = create_train_state_fn
121+
else:
122+
init_state_fn = functools.partial(maxtext_utils.init_initial_state, model, tx, config, True, example_rng)
110123

111-
# unsharded logical annotations
112-
logical_annotations = maxtext_utils.get_logical_annotations(model, tx, config, example_rng, topology_mesh)
124+
# Shaped state
125+
abstract_state, _, state_mesh_shardings = maxtext_utils.get_abstract_state(config, topology_mesh, init_state_fn, True)
126+
127+
if config.pure_nnx:
128+
# NNX doesn't use Linen logical annotations; derive PartitionSpecs from the physical shardings.
129+
logical_annotations = maxtext_utils_nnx.get_partition_spec_nnx(state_mesh_shardings)
130+
# For NNX, get_functional_train_with_signature expects the graphdef (static structure),
131+
# not the raw model — mirroring how the training loop does nnx.split(train_state).
132+
with nn_partitioning.axis_rules(config.logical_axis_rules):
133+
graphdef, _ = nnx.get_abstract_model(init_state_fn, topology_mesh)
134+
model = graphdef
135+
else:
136+
# unsharded logical annotations
137+
logical_annotations = maxtext_utils.get_logical_annotations(config, topology_mesh, init_state_fn)
113138

114139
# Shaped batch
115140
shaped_batch = maxtext_utils.get_shaped_batch(config)
116141

117-
shaped_train_args = (abstract_state, shaped_batch, shaped_rng)
142+
if config.pure_nnx:
143+
shaped_train_args = (abstract_state, shaped_batch)
144+
else:
145+
shaped_train_args = (abstract_state, shaped_batch, shaped_rng)
118146
shaped_train_kwargs = {}
119147
return shaped_train_args, shaped_train_kwargs, state_mesh_shardings, logical_annotations, model
120148

@@ -277,12 +305,20 @@ def main(argv: Sequence[str]) -> None:
277305
# print weights sharding info under debug sharding mode
278306
if config.debug_sharding:
279307
max_utils.print_non_trivial_mesh_axis(topology_mesh)
280-
maxtext_utils.print_shardings_params(
281-
shaped_train_args[0].params,
282-
state_mesh_shardings.params,
283-
topology_mesh,
284-
logical_annotations.params,
285-
)
308+
if config.pure_nnx:
309+
maxtext_utils.print_shardings_params(
310+
shaped_train_args[0],
311+
state_mesh_shardings,
312+
topology_mesh,
313+
logical_annotations,
314+
)
315+
else:
316+
maxtext_utils.print_shardings_params(
317+
shaped_train_args[0].params,
318+
state_mesh_shardings.params,
319+
topology_mesh,
320+
logical_annotations.params,
321+
)
286322

287323
# Compile
288324
print("Jitting and compiling train step...", flush=True)

src/maxtext/utils/generate_param_only_checkpoint.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
The output "parameter state" is output to the checkpoint directory. Additionally it is cast down to bf16.
2323
"""
2424

25+
import functools
2526
import os.path
2627
from typing import Sequence
2728

@@ -42,8 +43,6 @@
4243
from maxtext.utils import max_utils
4344
from maxtext.utils import maxtext_utils
4445

45-
Transformer = models.transformer_as_linen
46-
4746

4847
def _possibly_unroll_params(config, training_state, training_state_annotations, mesh):
4948
"""Unroll scanned input layers when force_unroll is set."""
@@ -93,12 +92,20 @@ def _read_train_checkpoint(config, checkpoint_manager, mesh):
9392
"""Read training checkpoint at path defined by load_full_state_path."""
9493
# Model and Optimizer definition
9594
quant = quantizations.configure_quantization(config)
96-
model = Transformer(config, mesh, quant, MODEL_MODE_TRAIN)
95+
if config.pure_nnx:
96+
raise NotImplementedError("Pure NNX support has not been implemented yet.")
97+
else:
98+
model = models.transformer_as_linen(config, mesh, quant, MODEL_MODE_TRAIN)
9799
rng = random.PRNGKey(0)
98100
learning_rate_schedule = maxtext_utils.create_learning_rate_schedule(config)
99101
tx = optimizers.get_optimizer(config, learning_rate_schedule)
102+
if config.pure_nnx:
103+
# NNX has a different function to init the training state.
104+
raise NotImplementedError("Pure NNX support has not been implemented yet.")
105+
else:
106+
init_state_fn = functools.partial(maxtext_utils.init_initial_state, model, tx, config, True, rng)
100107
state, state_mesh_notations, _, _ = maxtext_utils.setup_training_state(
101-
model, None, tx, config, rng, mesh, checkpoint_manager
108+
None, config, mesh, checkpoint_manager, init_state_fn
102109
)
103110
num_params = max_utils.calculate_num_params_from_pytree(state.params)
104111
max_logging.log(f"In input checkpoint Number of model params={num_params/1e9:.3f} billion")
@@ -109,7 +116,10 @@ def _generate_lora_decode_checkpoints(config, mesh):
109116
"""Read lora checkpoints checkpoint at path defined by load_full_state_path."""
110117
# Model and Optimizer definition
111118
quant = quantizations.configure_quantization(config)
112-
model = Transformer(config, mesh, quant, MODEL_MODE_TRAIN)
119+
if config.pure_nnx:
120+
raise NotImplementedError("Pure NNX support has not been implemented yet.")
121+
else:
122+
model = models.transformer_as_linen(config, mesh, quant, MODEL_MODE_TRAIN)
113123
rng = random.PRNGKey(0)
114124
learning_rate_schedule = maxtext_utils.create_learning_rate_schedule(config)
115125
tx = optimizers.get_optimizer(config, learning_rate_schedule)

src/maxtext/utils/layerwise_quantization.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
3131
"""
3232

33+
import functools
3334
import os
3435
from typing import Any, Sequence
3536

@@ -174,12 +175,19 @@ def __init__(self, config: Any, rng: PRNGKeyType):
174175

175176
# Model and quantization config
176177
self.quant = quantizations.configure_quantization(config)
177-
model = models.transformer_as_linen(
178-
config, mesh=self._mesh, quant=self.quant, model_mode=common_types.MODEL_MODE_TRAIN
179-
)
180-
self.unboxed_abstract_state, _, _ = maxtext_utils.get_abstract_state(
181-
model, None, self.config, self.rng, self._mesh, False
182-
)
178+
if self.config.pure_nnx:
179+
raise NotImplementedError("Pure NNX support has not been implemented yet.")
180+
else:
181+
model = models.transformer_as_linen(
182+
config, mesh=self._mesh, quant=self.quant, model_mode=common_types.MODEL_MODE_TRAIN
183+
)
184+
if self.config.pure_nnx:
185+
# NNX has a different function to init the training state.
186+
raise NotImplementedError("Pure NNX support has not been implemented yet.")
187+
else:
188+
init_state_fn = functools.partial(maxtext_utils.init_initial_state, model, None, self.config, False, self.rng)
189+
190+
self.unboxed_abstract_state, _, _ = maxtext_utils.get_abstract_state(self.config, self._mesh, init_state_fn, False)
183191

184192
def load_and_quantize(self) -> None:
185193
"""

src/maxtext/utils/lora_utils.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
""" Common LoRA utils needed to support LoRA adapters."""
1616

17+
from functools import partial
1718
import json
1819

1920
import jax
@@ -166,7 +167,12 @@ def setup_initial_lora_state(model, data_iterator, tx, config, rng, mesh, checkp
166167

167168
if lora_adapter_path:
168169
max_logging.log(f"Setting initial state of LoRA with lora_adapter_path = {lora_adapter_path}")
169-
unboxed_abstract_state, _, _ = maxtext_utils.get_abstract_state(model, tx, config, rng, mesh, True)
170+
if config.pure_nnx:
171+
# NNX has a different function to init the training state.
172+
raise NotImplementedError("Pure NNX support has not been implemented yet.")
173+
else:
174+
init_state_fn = partial(maxtext_utils.init_initial_state, model, tx, config, True, rng)
175+
unboxed_abstract_state, _, _ = maxtext_utils.get_abstract_state(config, mesh, init_state_fn, True)
170176

171177
lora_config_path = lora_adapter_path + "adapter_config.json"
172178

0 commit comments

Comments
 (0)