Change input dims in LTX2 transformer unit tests to match the parity test

prishajain1 · prishajain1 · commit 6e66e8416bc4 · 2026-02-11T09:47:03.000+05:30
diff --git a/src/maxdiffusion/tests/ltx_2_transformer_test.py b/src/maxdiffusion/tests/ltx_2_transformer_test.py
@@ -51,12 +51,38 @@ def setUp(self):
     self.config = config
     devices_array = create_device_mesh(config)
     self.mesh = Mesh(devices_array, config.mesh_axes)
+    
+    # Common dimensions from ltx2_parity_test.py
+    self.batch_size = 1
+    self.num_frames = 4
+    self.height = 32
+    self.width = 32
+    self.patch_size = 1
+    self.patch_size_t = 1
+    
+    self.in_channels = 8
+    self.out_channels = 8
+    self.audio_in_channels = 4
+    
+    # Derived: flattened spatio-temporal token count (frames * height * width, each divided by its patch size)
+    self.seq_len = (self.num_frames // self.patch_size_t) * (self.height // self.patch_size) * (self.width // self.patch_size)
+    
+    # Transformer config (matching parity test)
+    self.dim = 1024
+    self.num_heads = 8
+    self.head_dim = 128
+    self.cross_dim = 1024 # context dim
+    
+    self.audio_dim = 1024
+    self.audio_num_heads = 8
+    self.audio_head_dim = 128
+    self.audio_cross_dim = 1024
 
   def test_ltx2_rope(self):
     """Tests LTX2RotaryPosEmbed output shapes and basic functionality."""
-    dim = 64
-    patch_size = 1
-    patch_size_t = 1
+    dim = self.dim
+    patch_size = self.patch_size
+    patch_size_t = self.patch_size_t
     base_num_frames = 8
     base_height = 32
     base_width = 32
@@ -91,15 +117,14 @@ def test_ltx2_rope(self):
     cos, sin = rope(ids)
     
     # Check output shape
-    # dim=64, so output should be (1, 10, 64)
-    self.assertEqual(cos.shape, (1, 10, 64))
-    self.assertEqual(sin.shape, (1, 10, 64))
+    self.assertEqual(cos.shape, (1, 10, dim))
+    self.assertEqual(sin.shape, (1, 10, dim))
 
   def test_ltx2_ada_layer_norm_single(self):
     """Tests LTX2AdaLayerNormSingle initialization and execution."""
     key = jax.random.key(0)
     rngs = nnx.Rngs(key)
-    embedding_dim = 128
+    embedding_dim = self.dim
     
     with self.mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
       layer = LTX2AdaLayerNormSingle(
@@ -110,7 +135,7 @@ def test_ltx2_ada_layer_norm_single(self):
       )
       
       timestep = jnp.array([1.0])
-      batch_size = 1
+      batch_size = self.batch_size
       
       # Forward
       output, embedded_timestep = layer(timestep)
@@ -125,33 +150,33 @@ def test_ltx2_transformer_block(self):
     key = jax.random.key(0)
     rngs = nnx.Rngs(key)
     
-    dim = 64
-    audio_dim = 32
-    cross_attention_dim = 128
-    audio_cross_attention_dim = 128 # usually same as context
+    dim = self.dim
+    audio_dim = self.audio_dim
+    cross_attention_dim = self.cross_dim
+    audio_cross_attention_dim = self.audio_cross_dim
     
     with self.mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
       block = LTX2VideoTransformerBlock(
           rngs=rngs,
           dim=dim,
-          num_attention_heads=4,
-          attention_head_dim=16,
+          num_attention_heads=self.num_heads,
+          attention_head_dim=self.head_dim,
           cross_attention_dim=cross_attention_dim,
           audio_dim=audio_dim,
-          audio_num_attention_heads=4,
-          audio_attention_head_dim=8,
+          audio_num_attention_heads=self.audio_num_heads,
+          audio_attention_head_dim=self.audio_head_dim,
           audio_cross_attention_dim=audio_cross_attention_dim,
           mesh=self.mesh
       )
       
-      batch_size = 1
-      seq_len = 8
-      audio_seq_len = 4
+      batch_size = self.batch_size
+      seq_len = self.seq_len
+      audio_seq_len = 128 # Matching parity test
       
       hidden_states = jnp.zeros((batch_size, seq_len, dim))
       audio_hidden_states = jnp.zeros((batch_size, audio_seq_len, audio_dim))
-      encoder_hidden_states = jnp.zeros((batch_size, 10, cross_attention_dim))
-      audio_encoder_hidden_states = jnp.zeros((batch_size, 10, audio_cross_attention_dim))
+      encoder_hidden_states = jnp.zeros((batch_size, 128, cross_attention_dim))
+      audio_encoder_hidden_states = jnp.zeros((batch_size, 128, audio_cross_attention_dim))
       
       # Mock modulation parameters
       # sizes based on `transformer_ltx2.py` logic
@@ -185,54 +210,54 @@ def test_ltx2_transformer_model(self):
     key = jax.random.key(0)
     rngs = nnx.Rngs(key)
     
-    in_channels = 128
-    out_channels = 128
-    audio_in_channels = 64
+    in_channels = self.in_channels
+    out_channels = self.out_channels
+    audio_in_channels = self.audio_in_channels
     
     with self.mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
       model = LTX2VideoTransformer3DModel(
           rngs=rngs,
           in_channels=in_channels,
           out_channels=out_channels,
-          patch_size=1,
-          patch_size_t=1,
-          num_attention_heads=4,
-          attention_head_dim=16,
-          cross_attention_dim=64,
-          caption_channels=32,
+          patch_size=self.patch_size,
+          patch_size_t=self.patch_size_t,
+          num_attention_heads=self.num_heads,
+          attention_head_dim=self.head_dim,
+          cross_attention_dim=self.cross_dim,
+          caption_channels=32, # kept small for now, or match parity if needed
           audio_in_channels=audio_in_channels,
           audio_out_channels=audio_in_channels,
-          audio_num_attention_heads=4,
-          audio_attention_head_dim=16,
-          audio_cross_attention_dim=64,
+          audio_num_attention_heads=self.audio_num_heads,
+          audio_attention_head_dim=self.audio_head_dim,
+          audio_cross_attention_dim=self.audio_cross_dim,
           num_layers=1,
           mesh=self.mesh,
           attention_kernel="dot_product" # Force dot_product for test stability on CPU/small config
       )
       
-      batch_size = 1
-      seq_len = 8 # Flattened spatial-temporal tokens
-      audio_seq_len = 4
+      batch_size = self.batch_size
+      seq_len = self.seq_len
+      audio_seq_len = 128
       
       hidden_states = jnp.zeros((batch_size, seq_len, in_channels))
       audio_hidden_states = jnp.zeros((batch_size, audio_seq_len, audio_in_channels))
       
       timestep = jnp.array([1.0])
-      encoder_hidden_states = jnp.zeros((batch_size, 10, 32)) # (B, L, D) match caption_channels
-      audio_encoder_hidden_states = jnp.zeros((batch_size, 10, 32))
+      encoder_hidden_states = jnp.zeros((batch_size, 128, 32)) # (B, L, D) match caption_channels
+      audio_encoder_hidden_states = jnp.zeros((batch_size, 128, 32))
       
-      encoder_attention_mask = jnp.ones((batch_size, 10))
-      audio_encoder_attention_mask = jnp.ones((batch_size, 10))
+      encoder_attention_mask = jnp.ones((batch_size, 128))
+      audio_encoder_attention_mask = jnp.ones((batch_size, 128))
       
       output = model(
           hidden_states=hidden_states,
           audio_hidden_states=audio_hidden_states,
           encoder_hidden_states=encoder_hidden_states,
           audio_encoder_hidden_states=audio_encoder_hidden_states,
           timestep=timestep,
-          num_frames=2,
-          height=2,
-          width=2,
+          num_frames=self.num_frames,
+          height=self.height,
+          width=self.width,
           audio_num_frames=audio_seq_len,
           encoder_attention_mask=encoder_attention_mask,
           audio_encoder_attention_mask=audio_encoder_attention_mask,