
Commit 7ec8a31

Add class for GatedDeltaNetCache to kvcache.py
- Added support for GDN to maxengine, but NNX/Linen incompatible
- Merged code from other branch Qwen3-next
- Modified to accept dynamic model mode and work with maxengine changes
- Fix GDN init with model_mode
- Do same cache update during packed prefill as normal prefill
- Convert batch to int in init for state
1 parent d4a259d commit 7ec8a31

6 files changed: 474 additions & 30 deletions


src/MaxText/inference/kvcache.py

Lines changed: 74 additions & 1 deletion
@@ -230,7 +230,11 @@ def kv_cache_as_linen(
   )


-class KVCache(nnx.Module):
+class BaseCache(nnx.Module):
+  """Abstract base class for Caches."""
+  pass
+
+class KVCache(BaseCache):
   """Implementation of the KVCache."""

   def __init__(
@@ -842,6 +846,75 @@ def __call__(
       return self.kv_cache_autoregressive(key, value, use_ragged_attention)
     else:
       raise ValueError(f"Model Mode isn't supported! {model_mode=}")
+
+
+class GatedDeltaNetCache(BaseCache):
+  """Cache for Linear Attention (Gated Delta Net).
+
+  Stores the fixed-size recurrent state and the sliding window state for convolution.
+  """
+
+  def __init__(
+      self,
+      batch: int,
+      num_heads: int,
+      k_head_dim: int,
+      v_head_dim: int,
+      conv_kernel_size: int,
+      conv_dim: int,
+      dtype: DType,
+      cache_batch_axis_name: str = CACHE_BATCH,
+      cache_heads_axis_name: str = CACHE_HEADS,
+  ):
+    self.batch = batch
+    self.dtype = dtype
+
+    # 1. Recurrent State (S) for the Delta Rule
+    # Shape: [Batch, Heads, K_Dim, V_Dim]
+    # We maintain the running state matrix.
+    self.recurrent_state = nnx.Cache(
+        jnp.zeros((int(batch), num_heads, k_head_dim, v_head_dim), dtype=dtype),
+        # Sharding: Batch, Heads, None (K), None (V)
+        sharding=(cache_batch_axis_name, cache_heads_axis_name, None, None)
+    )
+
+    # 2. Convolution State for the 1D Conv
+    # Shape: [Batch, Kernel_Size - 1, Conv_Dim]
+    # We store the last (K-1) inputs to perform the sliding window conv during decoding.
+    self.conv_state = nnx.Cache(
+        jnp.zeros((int(batch), conv_kernel_size - 1, conv_dim), dtype=dtype),
+        # Sharding: Batch, None (Time), None (Dim)
+        sharding=(cache_batch_axis_name, None, None)
+    )
+
+  def __call__(self):
+    """Returns the cache variables for the layer to use."""
+    return self
+
+
+def gated_delta_net_cache_as_linen(
+    *,
+    batch: int,
+    num_heads: int,
+    head_dim: int,
+    conv_kernel_size: int,
+    conv_dim: int,
+    dtype: DType,
+    name: str | None = None,
+):
+  """Initializes the GatedDeltaNetCache and returns it as a Linen module."""
+  return nnx_wrappers.to_linen(
+      GatedDeltaNetCache,
+      batch=batch,
+      num_heads=num_heads,
+      head_dim=head_dim,
+      conv_kernel_size=conv_kernel_size,
+      conv_dim=conv_dim,
+      dtype=dtype,
+      metadata_fn=variable_to_logically_partitioned,
+      name=name,
+      abstract_init=False,
+  )


 def mla_kv_cache_as_linen(
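
For orientation, here is a minimal, standalone sketch (not MaxText code) of the two state buffers this cache holds and how a layer would fill and roll them between prefill and decode. The dimensions and dtype below are made-up examples; the real class additionally attaches sharding metadata and is wrapped into Linen as shown in the diff.

import jax.numpy as jnp
from flax import nnx

# Made-up sizes, for illustration only.
batch, num_heads, k_dim, v_dim = 2, 8, 64, 64
kernel_size, conv_dim = 4, 1024

# Recurrent state S for the delta rule: one running (K x V) matrix per head.
recurrent_state = nnx.Cache(jnp.zeros((batch, num_heads, k_dim, v_dim), jnp.float32))
# Convolution state: the last (kernel_size - 1) inputs for the causal 1D conv.
conv_state = nnx.Cache(jnp.zeros((batch, kernel_size - 1, conv_dim), jnp.float32))

# Prefill: store the trailing (kernel_size - 1) inputs for the later decode steps.
prefill_qkv = jnp.ones((batch, 16, conv_dim))
conv_state.value = prefill_qkv[:, -(kernel_size - 1):, :]

# Decode: prepend the cached window to the new token and roll the window forward.
new_qkv = jnp.ones((batch, 1, conv_dim))
conv_input = jnp.concatenate([conv_state.value, new_qkv], axis=1)
conv_state.value = conv_input[:, -(kernel_size - 1):, :]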

src/MaxText/layers/decoders.py

Lines changed: 37 additions & 5 deletions
@@ -874,15 +874,35 @@ def __call__(
               "is_nope_layer": llama4.determine_is_nope_layer(lyr, self.config.nope_layer_interval),
               "is_moe_layer": llama4.determine_is_moe_layer(lyr, self.config.interleave_moe_layer_step),
           }
+        kv_cache = None
+        if kv_caches is not None:
+          # For models other than Qwen3-Next, kv_caches is a list of caches.
+          if cfg.decoder_block != DecoderBlockType.QWEN3_NEXT:
+            kv_cache = kv_caches[lyr]
+
         if cfg.decoder_block == DecoderBlockType.QWEN3_NEXT:
           layer_kwargs = {"layer_idx": lyr}
+          # For Qwen3Next, kv_caches is a dictionary of lists of caches.
+          is_full_attention_layer = (lyr + 1) % cfg.inhomogeneous_layer_cycle_interval == 0
+          if kv_caches is not None:
+            if is_full_attention_layer:
+              k_cache = kv_caches["key_cache"][lyr]
+              v_cache = kv_caches["value_cache"][lyr]
+              kv_cache = (k_cache, v_cache)
+            else:
+              # For GDN layers, the cache is a dictionary.
+              # conv_state = kv_caches["conv_states"][lyr]
+              # recurrent_state = kv_caches["recurrent_states"][lyr]
+              # gdn_cache = {"conv_state": conv_state, "recurrent_state": recurrent_state}
+              # kv_cache = {"gdn_cache": gdn_cache}
+              kv_cache = None
+
         if cfg.decoder_block == DecoderBlockType.GPT_OSS:
           layer_kwargs = {"attention_type": gpt_oss.get_attention_type(layer_id=lyr)}
         layer = RemattedBlockLayer(
             config=cfg, mesh=mesh, name=f"layers_{lyr}", quant=self.quant, model_mode=self.model_mode, **layer_kwargs
         )
-        kv_cache = kv_caches[lyr] if kv_caches is not None else None
-        y, kv_cache = layer(
+        y, returned_cache = layer(
             y,
             decoder_segment_ids,
             decoder_positions,
@@ -895,8 +915,20 @@ def __call__(
             attention_metadata=attention_metadata,
             **layer_call_kwargs,
         )
-        if kv_caches is not None and kv_cache is not None:
-          kv_caches[lyr] = kv_cache
+        if kv_caches is not None and returned_cache is not None:
+          if cfg.decoder_block == DecoderBlockType.QWEN3_NEXT:
+            is_full_attention_layer = (lyr + 1) % cfg.inhomogeneous_layer_cycle_interval == 0
+            if is_full_attention_layer:
+              kv_caches["key_cache"][lyr] = returned_cache[0]
+              kv_caches["value_cache"][lyr] = returned_cache[1]
+            else:
+              # gdn_cache = returned_cache.get("gdn_cache")
+              # if gdn_cache:
+              #   kv_caches["conv_states"][lyr] = gdn_cache["conv_state"]
+              #   kv_caches["recurrent_states"][lyr] = gdn_cache["recurrent_state"]
+              pass
+          else:
+            kv_caches[lyr] = returned_cache

         assert isinstance(y, jax.Array)

@@ -990,4 +1022,4 @@ def _apply_gemma3_scanned_blocks(
         slot=slot,
         **layer_call_kwargs,
     )
-    return y
+    return y
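
To make the routing above easier to follow, here is an illustrative sketch of the two kv_caches layouts the loop indexes into. The layer count, cycle length, and placeholder cache entries are invented for this example; the GDN slots stay unused, matching the code path that currently passes None for GDN layers.

# Illustrative container layouts only; not MaxText code.
num_layers = 8
cycle = 4  # stand-in for cfg.inhomogeneous_layer_cycle_interval

# Default models: kv_caches is a flat list with one cache object per layer.
kv_caches_default = [f"cache_{i}" for i in range(num_layers)]

# Qwen3-Next: kv_caches is a dict of per-layer lists; every `cycle`-th layer is
# full attention, the remaining layers are GDN layers.
kv_caches_qwen3_next = {
    "key_cache": [None] * num_layers,
    "value_cache": [None] * num_layers,
    "conv_states": [None] * num_layers,
    "recurrent_states": [None] * num_layers,
}

for lyr in range(num_layers):
  is_full_attention_layer = (lyr + 1) % cycle == 0
  if is_full_attention_layer:
    kv_cache = (kv_caches_qwen3_next["key_cache"][lyr], kv_caches_qwen3_next["value_cache"][lyr])
  else:
    kv_cache = None  # GDN layers currently keep their state inside the layer module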

src/MaxText/layers/qwen3.py

Lines changed: 83 additions & 23 deletions
@@ -28,7 +28,7 @@
 from flax import nnx

 from MaxText import max_utils
-from MaxText.common_types import AttentionType, Config, DType, Array, BATCH, LENGTH_NO_EXP, EMBED
+from MaxText.common_types import AttentionType, Config, DType, Array, BATCH, LENGTH_NO_EXP, EMBED, MODEL_MODE_AUTOREGRESSIVE, MODEL_MODE_TRAIN, MODEL_MODE_PREFILL
 from MaxText.layers import attentions
 from MaxText.layers import initializers as max_initializers
 from MaxText.layers import linears
@@ -42,6 +42,7 @@
 from MaxText.layers.attentions import Attention
 from MaxText.layers.linears import DenseGeneral, MlpBlock
 from MaxText.layers.moe import RoutedMoE
+from MaxText.inference import kvcache


 # -----------------------------------------
@@ -279,7 +280,7 @@ def scan_body(prev_state, x):
   # Transpose back to (B, S, H, D_v)
   core_attn_out = jnp.transpose(core_attn_out, (0, 2, 1, 3)).astype(initial_dtype)

-  return core_attn_out, final_state if output_final_state else None
+  return core_attn_out, final_state


 class Qwen3NextGatedDeltaNet(nnx.Module):
@@ -310,7 +311,7 @@ class Qwen3NextGatedDeltaNet(nnx.Module):
     dtype: The datatype of the computation.
   """

-  def __init__(self, config: Config, dtype: DType = jnp.float32, *, rngs: nnx.Rngs):
+  def __init__(self, config: Config, dtype: DType = jnp.float32, model_mode: str = MODEL_MODE_TRAIN, *, rngs: nnx.Rngs):
     self.config = config
     self.dtype = dtype
     cfg = self.config
@@ -326,6 +327,17 @@ def __init__(self, config: Config, dtype: DType = jnp.float32, *, rngs: nnx.Rngs
     conv_kernel_size = cfg.gdn_conv_kernel_dim
     self.v_heads_per_k_head = self.num_v_heads // self.num_k_heads

+    if model_mode != MODEL_MODE_TRAIN:
+      self.cache = kvcache.GatedDeltaNetCache(
+          batch=config.per_device_batch_size,  # Or appropriate batch dim
+          num_heads=self.num_v_heads,
+          k_head_dim=self.head_k_dim,
+          v_head_dim=self.head_v_dim,
+          conv_kernel_size=self.config.gdn_conv_kernel_dim,
+          conv_dim=conv_dim,  # Make sure conv_dim is calculated before this
+          dtype=dtype,
+      )
+
     # Submodule instantiations
     self.in_proj_qkvz = linears.DenseGeneral(
         in_features_shape=in_features,
@@ -381,7 +393,7 @@ def a_log_init(key, shape, dtype=jnp.float32):
         rngs=rngs,
     )

-  def __call__(self, hidden_states: Array) -> Array:
+  def __call__(self, hidden_states: Array, model_mode: str = MODEL_MODE_TRAIN, kv_cache=None, **kwargs) -> tuple[Array, dict[str, Array]]:
     # hidden_states: (B, S, E)
     cfg = self.config
     batch, seq_len, _ = hidden_states.shape
@@ -455,12 +467,36 @@ def __call__(self, hidden_states: Array) -> Array:
     # conv_dim = 2 * K_dim + V_dim
     # qkv: (B, S, 2 * K_dim + V_dim)
     qkv = jnp.concatenate([q, k, v], axis=-1)
-
-    # TODO(parambole): Implement caching logic for conv_state and recurrent_state
-
-    # Input to conv_layer should be (B, S, C)
-    # qkv_conv shape: (B, S, conv_dim)
-    conv_out = self.conv1d(qkv)
+    conv_kernel_size = self.config.gdn_conv_kernel_dim
+
+    conv_state = None
+    if model_mode == MODEL_MODE_AUTOREGRESSIVE:
+      # Retrieve state from self.cache instead of input arg
+      conv_state = self.cache.conv_state.value
+
+      # Concatenate previous state with new input
+      conv_input = jnp.concatenate([conv_state, qkv], axis=1)
+      new_conv_state = conv_input[:, -(conv_kernel_size - 1) :, :]
+
+      # Update self.cache in place
+      self.cache.conv_state.value = new_conv_state
+    else:
+      # Prefill/Train: pad with zeros
+      conv_input = jnp.pad(qkv, ((0, 0), (conv_kernel_size - 1, 0), (0, 0)))
+
+      # For prefill, we must initialize the cache for the subsequent decode steps
+      if model_mode == MODEL_MODE_PREFILL:
+        # Store the last K-1 tokens as the initial state for decoding
+        new_conv_state = conv_input[:, -(conv_kernel_size - 1) :, :]
+        self.cache.conv_state.value = new_conv_state
+      else:
+        # Just a placeholder for return
+        new_conv_state = None
+
+    # Perform the convolution.
+    conv_out = self.conv1d(conv_input)
+    # Slice the output to match the original input sequence length.
+    conv_out = conv_out[:, -seq_len:, :]
     qkv_conv = jax.nn.silu(conv_out.astype(jnp.float32)).astype(cfg.dtype)
     # q_conv shape: (B, S, key_dim), k_conv shape: (B, S, key_dim), v_conv shape: (B, S, value_dim)
     q_conv, k_conv, v_conv = jnp.split(qkv_conv, [self.key_dim, 2 * self.key_dim], axis=-1)
@@ -496,10 +532,31 @@ def __call__(self, hidden_states: Array) -> Array:
       pass  # No repeating needed for query/key in this case

     # TODO(parambole): Pass and update cache state for jax_chunk_gated_delta_rule
-    # core_attn_out shape: (B, S, H_v, D_v)
-    core_attn_out, _ = jax_chunk_gated_delta_rule(
-        query, key, value, g, beta, chunk_size=cfg.gdn_chunk_size, use_qk_norm_in_gdn=cfg.use_qk_norm_in_gdn
+    recurrent_state = None
+    if model_mode == MODEL_MODE_AUTOREGRESSIVE:
+      # Retrieve state from self.cache
+      recurrent_state = self.cache.recurrent_state.value
+
+    core_attn_out, recurrent_state_out = jax_chunk_gated_delta_rule(
+        query,
+        key,
+        value,
+        g,
+        beta,
+        chunk_size=cfg.gdn_chunk_size,
+        initial_state=recurrent_state,
+        use_qk_norm_in_gdn=cfg.use_qk_norm_in_gdn,
     )
+
+    if model_mode != MODEL_MODE_TRAIN:
+      # Update self.cache in place for both prefill and decode
+      self.cache.recurrent_state.value = recurrent_state_out
+
+    # Construct return dictionary for compatibility with decoders.py (optional but safer)
+    new_cache = {
+        "conv_state": self.cache.conv_state.value if model_mode != MODEL_MODE_TRAIN else None,
+        "recurrent_state": self.cache.recurrent_state.value if model_mode != MODEL_MODE_TRAIN else None
+    }

     # =========================================================================
     # STEP D: Final Output Stage
@@ -517,7 +574,7 @@ def __call__(self, hidden_states: Array) -> Array:
     # Final output shape: (B, S, E)
     output = self.out_proj(gated_output)

-    return output
+    return output, new_cache


 class Qwen3NextFullAttention(nnx.Module):
@@ -829,7 +886,7 @@ def __init__(
           rngs=rngs,
       )
     else:
-      self.attention = Qwen3NextGatedDeltaNet(config=cfg, dtype=cfg.dtype, rngs=rngs)
+      self.attention = Qwen3NextGatedDeltaNet(config=cfg, dtype=cfg.dtype, model_mode=model_mode, rngs=rngs)

     # Second LayerNorm, applied before the MoE block.
     self.post_attention_layernorm = Qwen3NextRMSNorm(
@@ -853,7 +910,7 @@ def __call__(
       previous_chunk=None,
      page_state: None | page_manager.PageState = None,
      slot: None | int = None,
-      kv_cache: None | jnp.ndarray = None,
+      kv_cache: None | dict[str, Array] = None,
      attention_metadata: None | dict[str, Any] = None,
  ):
     # Unpack inputs if it's a tuple (e.g. from a previous layer returning (hidden_states, kv_cache))
@@ -867,7 +924,7 @@ def __call__(

     # Conditionally apply either the Linear Attention or Full Attention block.
     if isinstance(self.attention, Qwen3NextFullAttention):
-      attention_output, kv_cache = cast(Qwen3NextFullAttention, self.attention)(
+      attention_output, new_kv_cache = cast(Qwen3NextFullAttention, self.attention)(
           hidden_states,
           decoder_segment_ids,
           decoder_positions,
@@ -877,9 +934,13 @@ def __call__(
           attention_metadata=attention_metadata,
       )
     elif isinstance(self.attention, Qwen3NextGatedDeltaNet):
-      attention_output = cast(Qwen3NextGatedDeltaNet, self.attention)(hidden_states)
-    else:
-      raise TypeError(f"Unexpected type for self.attention: {type(self.attention)}")
+      gdn_cache = kv_cache.get("gdn_cache") if kv_cache is not None else None
+      attention_output, _ = cast(Qwen3NextGatedDeltaNet, self.attention)(
+          hidden_states,
+          model_mode=model_mode,
+          kv_cache=None,
+      )
+      new_kv_cache = None

     # First residual connection after attention
     hidden_states = residual + attention_output
@@ -906,8 +967,7 @@ def __call__(
         layer_output,
         self.activation_axis_names,
     )
-
-    return layer_output, kv_cache
+    return layer_output, new_kv_cache


 # -----------------------------------------
@@ -1758,4 +1818,4 @@ def qwen3omni_visionprojector_as_linen(config: Config, mesh: Mesh) -> nn.Module:
 Qwen3NextScannableBlockToLinen = nnx_wrappers.to_linen_class(
     Qwen3NextScannableBlock,
     base_metadata_fn=max_initializers.variable_to_logically_partitioned,
-)
+)
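
As a summary of the new state handling, the sketch below mirrors a single autoregressive step as wired above: roll the causal-conv window, convolve over the reconstructed window, and thread the recurrent state through the delta-rule kernel. `run_gated_delta_rule` is only a stand-in for jax_chunk_gated_delta_rule (the real code also splits the conv output into query/key/value and computes g and beta first), and all shapes are illustrative.

import jax.numpy as jnp

def gdn_decode_step(qkv, conv_state, recurrent_state, conv1d, run_gated_delta_rule, conv_kernel_size):
  """One decode step, mirroring the cache handling in the diff above (illustrative only)."""
  seq_len = qkv.shape[1]  # typically 1 during decode

  # Causal-conv window: prepend the cached last (K-1) inputs, then keep the
  # trailing (K-1) inputs as the next window.
  conv_input = jnp.concatenate([conv_state, qkv], axis=1)
  new_conv_state = conv_input[:, -(conv_kernel_size - 1):, :]

  # Convolve over the reconstructed window and keep outputs for the new token(s).
  conv_out = conv1d(conv_input)[:, -seq_len:, :]

  # The delta rule starts from the cached recurrent state and returns the updated one.
  core_out, new_recurrent_state = run_gated_delta_rule(conv_out, initial_state=recurrent_state)
  return core_out, new_conv_state, new_recurrent_state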
