Use rope_type="split" in test_import_parity_comparison and remove the separate test_import_parity_comparison_split test

prishajain1 · prishajain1 · commit df8a5fc62068 · 2026-02-11T12:16:52.000+05:30
diff --git a/src/maxdiffusion/tests/ltx2_parity_test.py b/src/maxdiffusion/tests/ltx2_parity_test.py
@@ -415,6 +415,7 @@ def test_import_parity_comparison(self):
           num_layers=1,
           mesh=self.mesh,
           attention_kernel="dot_product",
+          rope_type="split",
       )
 
     # 2. Convert Weights (PyTorch -> Flax NNX)
@@ -597,250 +598,5 @@ def convert_weight(pt_key_base, jax_key):
     print(f"Audio Std: {jnp.std(max_audio_sample)}")
 
 
-  def test_import_parity_comparison_split(self):
-    """
-    Verifies that the LTX2VideoTransformer3DModel output matches the PyTorch implementation output
-    exactly (to a high precision) given the same inputs and weights, with rope_type='split'.
-    """
-    print("\n=== Parity Comparison Test (Split) ===")
-    try:
-      import torch
-    except ImportError:
-      print("Skipping parity test: torch not installed.")
-      return
-
-    import os
-    from flax import traverse_util
-
-    parity_file = "ltx2_parity_data_split.pt"
-    if not os.path.exists(parity_file):
-      print(f"Skipping parity test: {parity_file} not found. Run diffusers test first.")
-      return
-
-    print(f"Loading {parity_file}...")
-    parity_data = torch.load(parity_file)
-    state_dict = parity_data["state_dict"]
-    inputs = parity_data["inputs"]
-    torch_outputs = parity_data["outputs"]
-    config = parity_data["config"]
-
-    # 1. Instantiate Model
-    # Ensure config matches what was exported
-    with self.mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
-      model = LTX2VideoTransformer3DModel(
-          rngs=nnx.Rngs(0),
-          in_channels=config["in_channels"],
-          out_channels=config["out_channels"],
-          patch_size=config["patch_size"],
-          patch_size_t=1,
-          num_attention_heads=8,
-          attention_head_dim=128,
-          cross_attention_dim=1024,  # Parity config
-          caption_channels=config["caption_channels"],
-          audio_in_channels=4,
-          audio_out_channels=4,
-          audio_patch_size=1,
-          audio_patch_size_t=1,
-          audio_num_attention_heads=8,
-          audio_attention_head_dim=128,
-          audio_cross_attention_dim=1024,
-          num_layers=1,
-          mesh=self.mesh,
-          attention_kernel="dot_product",
-          rope_type="split",
-      )
-
-    # 2. Convert Weights (PyTorch -> Flax NNX)
-    print("Converting weights (Split)...")
-
-    graph_def, state = nnx.split(model)
-    flat_state = traverse_util.flatten_dict(state.to_pure_dict())
-    new_flat_state = {}
-
-    # Helper to convert/transpose weights
-    def convert_weight(pt_key_base, jax_key):
-      # Try original key first
-      pt_key = pt_key_base
-
-      # Map JAX 'kernel' to PT 'weight'
-      if "kernel" in str(jax_key):
-        pt_key = pt_key.replace("kernel", "weight")
-
-      # Fix scale logic (RMSNorm)
-      # Only replace 'scale' if it's the parameter name (last part) to avoid breaking 'scale_shift'
-      if jax_key[-1] == "scale" and "scale_shift" not in str(jax_key):
-        pt_key = pt_key.replace("scale", "weight")
-
-      # Fix transformer_blocks prefix
-      # JAX: ('transformer_blocks', 'attn1', ...)
-      # PT: transformer_blocks.0.attn1...
-      is_transformer_block = "transformer_blocks" in str(jax_key)
-      if is_transformer_block:
-        if "transformer_blocks" in pt_key and "transformer_blocks.0" not in pt_key:
-          pt_key = pt_key.replace("transformer_blocks", "transformer_blocks.0")
-
-      # Fix `layers` keyword in JAX key usually implies `layers.0` if it was there?
-      if "layers" in pt_key:
-        pt_key = pt_key.replace("layers.", "")
-
-      # Fix to_out (Diffusers has to_out[0] as Linear)
-      if "to_out" in pt_key and ("weight" in pt_key or "bias" in pt_key):
-        pt_key = pt_key.replace("to_out.weight", "to_out.0.weight")
-        pt_key = pt_key.replace("to_out.bias", "to_out.0.bias")
-
-      # Fix FeedForward (net_0 -> net.0.proj, net_2 -> net.2)
-      if "net_0" in pt_key:
-        pt_key = pt_key.replace("net_0", "net.0.proj")
-      if "net_2" in pt_key:
-        pt_key = pt_key.replace("net_2", "net.2")
-
-      if pt_key not in state_dict:
-        # Try removing .0 if it was added erroneously
-        candidates = [pt_key]
-        if "transformer_blocks.0" in pt_key:
-          candidates.append(pt_key.replace("transformer_blocks.0", "transformer_blocks"))
-
-        # Special Case: scale_shift_table
-        # Only allow global scale_shift_table fallback if NOT inside transformer block
-        if "scale_shift_table" in str(jax_key) and not is_transformer_block:
-          candidates.append("scale_shift_table")
-
-        if "audio_scale_shift_table" in str(jax_key) and not is_transformer_block:
-          candidates.append("audio_scale_shift_table")
-
-        for c in candidates:
-          if c in state_dict:
-            pt_key = c
-            break
-        else:
-          # If unmapped bias, maybe it's just missing in PT (e.g. RMSNorm without bias)
-          if "bias" in str(jax_key):
-            # Initialize to zeros?
-            print(f"Warning: Missing PT bias for {jax_key}. initializing to zeros.")
-            # Use shape from current flat_state param
-            return jnp.zeros(flat_state[jax_key].shape), pt_key
-
-          return None, pt_key
-
-      w = state_dict[pt_key].cpu().numpy()
-
-      # Debug Special Parameters
-      if "scale_shift_table" in str(jax_key):
-        print(f"Mapping scale_shift_table for {jax_key} from {pt_key} with shape {w.shape}")
-
-      # Handle vmap/scan dimension for transformer_blocks
-      if is_transformer_block:
-        # JAX expects (num_layers, ...) for these weights
-        # PT has (...)
-        # So expand dims
-        w = w[None, ...]
-
-      # Handle Transforms
-      is_kernel = "kernel" in str(jax_key)
-      # Embedding projections are also 'kernel' in JAX (Linear)
-      if is_kernel:
-        if w.ndim == 3:  # (1, out, in) -> (1, in, out)
-          w = w.transpose(0, 2, 1)
-        elif w.ndim == 2:  # (out, in) -> (in, out)
-          w = w.T
-
-      return jnp.array(w), pt_key
-
-    total_count = len(flat_state)
-    mapped_count = 0
-
-    # Debug: Print available keys for audio_ff
-    print("Debugging PT keys for mapping failure diagnosis (Split):")
-    print("Available PT keys with 'ff':", [k for k in state_dict.keys() if "ff" in k])
-    print("Available PT keys with 'norm':", [k for k in state_dict.keys() if "norm" in k])
-
-    for key in flat_state.keys():
-      # Construct base PT key from JAX key tuple
-      pt_key_base = ".".join([str(k) for k in key if str(k) != "layers"])
-
-      w, used_pt_key = convert_weight(pt_key_base, key)
-      if w is not None:
-        # Handle bias zero init which might return scalar 0 if shape was (1,) but it should be array
-        # jnp.zeros(shape) returns array.
-        new_flat_state[key] = w
-        mapped_count += 1
-      else:
-        print(f"Warning: Could not map JAX key {key} (PT attempt: {used_pt_key})")
-        if "audio_ff" in str(key):
-          print("Available audio_ff keys:", [k for k in state_dict.keys() if "audio_ff" in k])
-        if "norm_out" in str(key):
-          print("Available norm_out keys:", [k for k in state_dict.keys() if "norm_out" in k])
-
-    print(f"Mapped {mapped_count}/{total_count} params.")
-
-    # Update model state
-    new_state = traverse_util.unflatten_dict(new_flat_state)
-    nnx.update(model, new_state)
-
-    # 3. Prepare Inputs
-    jax_inputs = {
-        "hidden_states": jnp.array(inputs["hidden_states"].cpu().numpy()),
-        "audio_hidden_states": jnp.array(inputs["audio_hidden_states"].cpu().numpy()),
-        "encoder_hidden_states": jnp.array(inputs["encoder_hidden_states"].cpu().numpy()),
-        "audio_encoder_hidden_states": jnp.array(inputs["audio_encoder_hidden_states"].cpu().numpy()),
-        "timestep": jnp.array(inputs["timestep"].cpu().numpy()),
-        "encoder_attention_mask": jnp.array(inputs["encoder_attention_mask"].cpu().numpy()),
-        "audio_encoder_attention_mask": jnp.array(inputs["audio_encoder_attention_mask"].cpu().numpy()),
-    }
-
-    print("\n=== Input Verification (Split) ===")
-    print(f"Hidden States Sum: {jnp.sum(jax_inputs['hidden_states'])}")
-    print(f"Audio Hidden States Sum: {jnp.sum(jax_inputs['audio_hidden_states'])}")
-    print(f"Encoder Hidden States Sum: {jnp.sum(jax_inputs['encoder_hidden_states'])}")
-    print(f"Audio Encoder Hidden States Sum: {jnp.sum(jax_inputs['audio_encoder_hidden_states'])}")
-    print(f"Timestep: {jax_inputs['timestep']}")
-    print("==========================\n")
-
-    # 4. Run Forward
-    print("Running MaxDiffusion forward pass (Split)...")
-    output = model(
-        hidden_states=jax_inputs["hidden_states"],
-        audio_hidden_states=jax_inputs["audio_hidden_states"],
-        encoder_hidden_states=jax_inputs["encoder_hidden_states"],
-        audio_encoder_hidden_states=jax_inputs["audio_encoder_hidden_states"],
-        timestep=jax_inputs["timestep"],
-        encoder_attention_mask=jax_inputs["encoder_attention_mask"],
-        audio_encoder_attention_mask=jax_inputs["audio_encoder_attention_mask"],
-        num_frames=config["num_frames"] if "num_frames" in config else 4,
-        height=config["height"] if "height" in config else 32,
-        width=config["width"] if "width" in config else 32,
-        audio_num_frames=128,
-        fps=24.0,
-        return_dict=True,
-    )
-
-    max_sample = output["sample"]
-    max_audio_sample = output["audio_sample"]
-
-    print("MAXDIFF Output Sample Stats (Split):")
-    print(f"Sample Max: {jnp.max(max_sample)}")
-    print(f"Sample Min: {jnp.min(max_sample)}")
-    print(f"Sample Mean: {jnp.mean(max_sample)}")
-    print(f"Sample Std: {jnp.std(max_sample)}")
-
-    print("MAXDIFF Output Audio Sample Stats (Split):")
-    print(f"Audio Max: {jnp.max(max_audio_sample)}")
-    print(f"Audio Min: {jnp.min(max_audio_sample)}")
-    print(f"Audio Mean: {jnp.mean(max_audio_sample)}")
-    print(f"Audio Std: {jnp.std(max_audio_sample)}")
-    
-    # 5. Parity Check
-    parity_sample = jnp.array(torch_outputs["sample"].cpu().numpy())
-    parity_audio_sample = jnp.array(torch_outputs["audio_sample"].cpu().numpy())
-    
-    print("Checking Parity (Split)...")
-    print(f"Max Diff Sample: {jnp.max(jnp.abs(max_sample - parity_sample))}")
-    print(f"Max Diff Audio: {jnp.max(jnp.abs(max_audio_sample - parity_audio_sample))}")
-
-    self.assertTrue(jnp.allclose(max_sample, parity_sample, atol=1e-3))
-    self.assertTrue(jnp.allclose(max_audio_sample, parity_audio_sample, atol=1e-3))
-    print("Parity check passed (Split)!")
-
-
 if __name__ == "__main__":
   unittest.main()