
Commit 94324cd

Charles Li authored and committed
Support gradient_accumulation
1 parent e3d4720 commit 94324cd

2 files changed

Lines changed: 129 additions & 2 deletions

src/maxtext/trainers/pre_train/nnx_train.py

Lines changed: 8 additions & 2 deletions
@@ -101,6 +101,7 @@
 from maxtext.optimizers import optimizers
 from maxtext.utils import exceptions, max_logging, max_utils, maxtext_utils, model_creation_utils, sharding
 from maxtext.utils.globals import EPS
+from maxtext.utils.gradient_accumulation import nnx_gradient_accumulation_loss_and_grad
 from maxtext.utils.rampup_batch import create_rampup_manager

 _diag_modules = _cloud_diag()
@@ -287,8 +288,11 @@ def train_step(
   # Compute loss and gradients w.r.t. model parameters.
   # nnx.value_and_grad differentiates only through nnx.Param variables,
   # keeping non-differentiable state (RNGs, cache, etc.) frozen.
-  grad_fn = nnx.value_and_grad(loss_fn, argnums=0, has_aux=True)
-  (loss, aux), raw_grads = grad_fn(model, config, data, dropout_rng, is_train=True)
+  if config.gradient_accumulation_steps > 1:
+    loss, aux, raw_grads = nnx_gradient_accumulation_loss_and_grad(loss_fn, model, config, data, dropout_rng)
+  else:
+    grad_fn = nnx.value_and_grad(loss_fn, argnums=0, has_aux=True)
+    (loss, aux), raw_grads = grad_fn(model, config, data, dropout_rng, is_train=True)

   # Cast gradients to configured dtype before clipping / accumulation
   raw_grads = jax.tree.map(
@@ -612,6 +616,7 @@ def train_loop(config, recorder, state=None):
   if config.compiled_trainstep_file == "":
     compiled = p_train_step.lower(model_state, opt_state, shaped_batch, example_rng).compile()
     compiled_stats = compiled.memory_analysis()
+    max_logging.info(f"print_compiled_memory_stats:")
     max_utils.print_compiled_memory_stats(compiled_stats)

   # ---- Profiler / logger ----------------------------------------------------
@@ -625,6 +630,7 @@ def train_loop(config, recorder, state=None):
   _job_completed_gracefully = False
   try:
     last_step_completion = datetime.datetime.now()
+    max_logging.info(f"Entering train loop from start_step={start_step}")

     for step in np.arange(start_step, config.steps):
       prof.maybe_activate_profiler(step, opt_state)
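
A note on the `else` branch kept above: `nnx.value_and_grad` takes gradients only with respect to the `nnx.Param` leaves of the module passed as `argnums=0`, and other module state receives no gradient. Below is a minimal, self-contained sketch of that pattern using a toy module and loss function (illustrative names only, not MaxText code; the real `loss_fn` additionally takes `config`, `data`, and `dropout_rng`):

from flax import nnx
import jax.numpy as jnp

class ToyModel(nnx.Module):
  """Tiny stand-in for the real model: one Param plus non-Param state."""

  def __init__(self):
    self.w = nnx.Param(jnp.ones((2,)))        # differentiated by nnx.value_and_grad
    self.calls = nnx.BatchStat(jnp.array(0))  # non-Param state, excluded from gradients

  def __call__(self, x):
    return jnp.sum(self.w * x)

def toy_loss_fn(model, x):
  loss = model(x)
  return loss, {"total_loss": loss}

model = ToyModel()
grad_fn = nnx.value_and_grad(toy_loss_fn, argnums=0, has_aux=True)
(loss, aux), grads = grad_fn(model, jnp.array([1.0, 2.0]))
# `grads` is an nnx.State containing only the Param leaf `w`; `calls` receives no gradient.

Both branches of the new dispatch use the same `loss_fn` and aux dictionary, so `gradient_accumulation_steps` changes only how many microbatches contribute to one optimizer update, not the loss definition.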

src/maxtext/utils/gradient_accumulation.py

Lines changed: 121 additions & 0 deletions
@@ -137,6 +137,127 @@ def reshape_to_microbatch_accumulations(batch_arr):
   return loss, aux, raw_grads


+# ---------------------------------------------------------------------------
+# Gradient accumulation helper for NNX
+# ---------------------------------------------------------------------------
+
+
+def nnx_gradient_accumulation_loss_and_grad(_loss_fn, model, config, data, dropout_rng):
+  """
+  Calculates gradients using gradient accumulation.
+
+  This function computes the gradient of `_loss_fn` over multiple microbatches
+  and accumulates them before returning a single, averaged gradient. It uses
+  `jax.lax.scan` for efficient accumulation on device.
+
+  It also supports a `shard_optimizer_over_data` mode (e.g., ZeRO-1) where
+  parameters are cast to bf16 and sharded *before* the accumulation loop
+  to perform the all-gather in lower precision.
+
+  Args:
+    _loss_fn: The loss function to differentiate. Its signature is expected
+      to be: `(model, config, data, dropout_rng, is_train=True)`.
+    model: The model module.
+    config: Model and training configuration object. Must contain
+      `gradient_accumulation_steps` and `shard_optimizer_over_data`.
+    data: A PyTree of batched data. The leading dimension is assumed
+      to be the total batch size (microbatch_size * num_accumulations).
+    dropout_rng: JAX PRNGKey for dropout.
+
+  Returns:
+    A tuple containing:
+      - total_loss (Array): The mean loss, averaged over all microbatches.
+      - final_aux (PyTree): Auxiliary outputs, summed across microbatches.
+      - raw_grads (PyTree): The accumulated and averaged gradients.
+  """
+
+  # For more efficient DP/ZeRO-1 + GA
+  # if config.shard_mode == ShardMode.EXPLICIT and config.ici_data_parallelism > 1:
+  #   ga_params_shardings = jax.tree.map(update_sharding_for_reduced, params_shardings)
+  #   grad_shardings = jax.tree.map(update_sharding_for_unreduced, params_shardings)
+  # else:
+  #   ga_params_shardings = grad_shardings = params_shardings
+
+  graphdef, params, rest = nnx.split(model, nnx.Param, ...)
+
+  # When using Zero-1 optimizer sharding, cast params to lower precision and apply sharding constraints
+  # so that all-gather is done once in the lower precision before the gradient accumulation loop
+  if config.shard_optimizer_over_data:
+
+    def convert_to_bf16(param):
+      if param.dtype == jnp.float32:
+        return param.astype(jnp.bfloat16)
+      return param
+
+    ga_params = jax.tree.map(convert_to_bf16, params)
+  else:
+    ga_params = params
+
+  # ga_params = jax.tree.map(_maybe_shard_with_name, ga_params, ga_params_shardings)
+  grad_func = nnx.value_and_grad(_loss_fn, argnums=0, has_aux=True)
+
+  def accumulate_gradient(acc_grad_and_loss, data):
+    ga_params = acc_grad_and_loss["ga_params"]
+    # Reconstruct the model using the fixed parameters (ga_params)
+    # and the advancing non-parameter state (RNGs) from the carry.

+    # as ga_params will change during train_step, always create a local_model
+    local_model = nnx.merge(graphdef, ga_params, acc_grad_and_loss["rest_state"])
+    (_, aux), cur_batch_gradient = grad_func(local_model, config, data, dropout_rng, is_train=True)
+    _, _, next_rest_state = nnx.split(local_model, nnx.Param, ...)
+
+    acc_grad_and_loss["rest_state"] = next_rest_state
+    acc_grad_and_loss["loss"] += aux["total_loss"]
+    acc_grad_and_loss["moe_lb_loss"] += aux["moe_lb_loss"]
+    acc_grad_and_loss["mtp_loss"] += aux["mtp_loss"]
+    acc_grad_and_loss["grad"] = jax.tree.map(lambda x, y: x + y, cur_batch_gradient, acc_grad_and_loss["grad"])
+    acc_grad_and_loss["total_weights"] += aux["total_weights"]
+    return acc_grad_and_loss, aux
+
+  def reshape_to_microbatch_accumulations(batch_arr):
+    """Reshape [B, ...] → [num_microbatches, B//num_microbatches, ...]."""
+    num_microbatches = config.gradient_accumulation_steps
+    microbatch_shape = (num_microbatches, batch_arr.shape[0] // num_microbatches) + batch_arr.shape[1:]
+    return jnp.reshape(batch_arr, microbatch_shape)
+
+  # def reshape_to_microbatch_accumulations(batch_arr):
+  #   """Reshape global batch to microbatches, assuming batch axis is leading."""
+  #   num_microbatches = config.gradient_accumulation_steps
+  #   microbatch_shape = (batch_arr.shape[0] // num_microbatches, num_microbatches) + batch_arr.shape[1:]
+  #   reshaped_batch_arr = jnp.reshape(batch_arr, microbatch_shape)
+  #   return jnp.swapaxes(reshaped_batch_arr, 0, 1)
+
+  data = jax.tree.map(reshape_to_microbatch_accumulations, data)
+  init_grad = jax.tree.map(jnp.zeros_like, ga_params)
+  # init_grad = jax.tree.map(_maybe_shard_with_name, init_grad, grad_shardings)
+  init_grad_and_loss = {
+      "loss": 0.0,
+      "grad": init_grad,
+      "total_weights": 0,
+      "moe_lb_loss": 0.0,
+      "mtp_loss": 0.0,
+      "ga_params": ga_params,
+  }
+  init_grad_and_loss["rest_state"] = rest
+
+  grad_and_loss, aux = jax.lax.scan(
+      accumulate_gradient, init_grad_and_loss, data, length=config.gradient_accumulation_steps
+  )
+  loss = (
+      grad_and_loss["loss"] / grad_and_loss["total_weights"]
+      + grad_and_loss["moe_lb_loss"] / config.gradient_accumulation_steps
+      + grad_and_loss["mtp_loss"] / config.gradient_accumulation_steps
+  )
+  raw_grads = grad_and_loss["grad"]
+  raw_grads = jax.tree.map(lambda arr: arr / grad_and_loss["total_weights"], raw_grads)
+  aux = jax.tree.map(lambda x: jnp.sum(x, axis=0), aux)  # pytype: disable=module-attr
+
+  nnx.update(model, grad_and_loss["rest_state"])
+
+  return loss, aux, raw_grads
+
+
 # GA helper functions
 def update_sharding_for_reduced(sharding: NamedSharding) -> NamedSharding:
   """

0 commit comments