@@ -596,5 +596,242 @@ def convert_weight(pt_key_base, jax_key):
596596 print (f"Audio Std: { jnp .std (max_audio_sample )} " )
597597
598598
599+ def test_import_parity_comparison_split (self ):
600+ """
601+ Verifies that the LTX2VideoTransformer3DModel output matches the PyTorch implementation output
602+ exactly (to a high precision) given the same inputs and weights, with rope_type='split'.
603+ """
604+ from flax import traverse_util
605+
606+ parity_file = "ltx2_parity_data_split.pt"
607+ if not os .path .exists (parity_file ):
608+ print (f"Skipping parity test: { parity_file } not found. Run diffusers test first." )
609+ return
610+
611+ print (f"Loading { parity_file } ..." )
612+ parity_data = torch .load (parity_file )
613+ state_dict = parity_data ["state_dict" ]
614+ inputs = parity_data ["inputs" ]
615+ torch_outputs = parity_data ["outputs" ]
616+ config = parity_data ["config" ]
617+
618+ # 1. Instantiate Model
619+ # Ensure config matches what was exported
620+ with self .mesh , nn_partitioning .axis_rules (self .config .logical_axis_rules ):
621+ model = LTX2VideoTransformer3DModel (
622+ rngs = nnx .Rngs (0 ),
623+ in_channels = config ["in_channels" ],
624+ out_channels = config ["out_channels" ],
625+ patch_size = config ["patch_size" ],
626+ patch_size_t = 1 ,
627+ num_attention_heads = 8 ,
628+ attention_head_dim = 128 ,
629+ cross_attention_dim = 1024 , # Parity config
630+ caption_channels = config ["caption_channels" ],
631+ audio_in_channels = 4 ,
632+ audio_out_channels = 4 ,
633+ audio_patch_size = 1 ,
634+ audio_patch_size_t = 1 ,
635+ audio_num_attention_heads = 8 ,
636+ audio_attention_head_dim = 128 ,
637+ audio_cross_attention_dim = 1024 ,
638+ num_layers = 1 ,
639+ mesh = self .mesh ,
640+ attention_kernel = "dot_product" ,
641+ rope_type = "split" ,
642+ )
643+
644+ # 2. Convert Weights (PyTorch -> Flax NNX)
645+ print ("Converting weights (Split)..." )
646+
647+ graph_def , state = nnx .split (model )
648+ flat_state = traverse_util .flatten_dict (state .to_pure_dict ())
649+ new_flat_state = {}
650+
651+ # Helper to convert/transpose weights
652+ def convert_weight (pt_key_base , jax_key ):
653+ # Try original key first
654+ pt_key = pt_key_base
655+
656+ # Map JAX 'kernel' to PT 'weight'
657+ if "kernel" in str (jax_key ):
658+ pt_key = pt_key .replace ("kernel" , "weight" )
659+
660+ # Fix scale logic (RMSNorm)
661+ # Only replace 'scale' if it's the parameter name (last part) to avoid breaking 'scale_shift'
662+ if jax_key [- 1 ] == "scale" and "scale_shift" not in str (jax_key ):
663+ pt_key = pt_key .replace ("scale" , "weight" )
664+
665+ # Fix transformer_blocks prefix
666+ # JAX: ('transformer_blocks', 'attn1', ...)
667+ # PT: transformer_blocks.0.attn1...
668+ is_transformer_block = "transformer_blocks" in str (jax_key )
669+ if is_transformer_block :
670+ if "transformer_blocks" in pt_key and "transformer_blocks.0" not in pt_key :
671+ pt_key = pt_key .replace ("transformer_blocks" , "transformer_blocks.0" )
672+
673+ # Fix `layers` keyword in JAX key usually implies `layers.0` if it was there?
674+ if "layers" in pt_key :
675+ pt_key = pt_key .replace ("layers." , "" )
676+
677+ # Fix to_out (Diffusers has to_out[0] as Linear)
678+ if "to_out" in pt_key and ("weight" in pt_key or "bias" in pt_key ):
679+ pt_key = pt_key .replace ("to_out.weight" , "to_out.0.weight" )
680+ pt_key = pt_key .replace ("to_out.bias" , "to_out.0.bias" )
681+
682+ # Fix FeedForward (net_0 -> net.0.proj, net_2 -> net.2)
683+ if "net_0" in pt_key :
684+ pt_key = pt_key .replace ("net_0" , "net.0.proj" )
685+ if "net_2" in pt_key :
686+ pt_key = pt_key .replace ("net_2" , "net.2" )
687+
688+ if pt_key not in state_dict :
689+ # Try removing .0 if it was added erroneously
690+ candidates = [pt_key ]
691+ if "transformer_blocks.0" in pt_key :
692+ candidates .append (pt_key .replace ("transformer_blocks.0" , "transformer_blocks" ))
693+
694+ # Special Case: scale_shift_table
695+ # Only allow global scale_shift_table fallback if NOT inside transformer block
696+ if "scale_shift_table" in str (jax_key ) and not is_transformer_block :
697+ candidates .append ("scale_shift_table" )
698+
699+ if "audio_scale_shift_table" in str (jax_key ) and not is_transformer_block :
700+ candidates .append ("audio_scale_shift_table" )
701+
702+ for c in candidates :
703+ if c in state_dict :
704+ pt_key = c
705+ break
706+ else :
707+ # If unmapped bias, maybe it's just missing in PT (e.g. RMSNorm without bias)
708+ if "bias" in str (jax_key ):
709+ # Initialize to zeros?
710+ print (f"Warning: Missing PT bias for { jax_key } . initializing to zeros." )
711+ # Use shape from current flat_state param
712+ return jnp .zeros (flat_state [jax_key ].shape ), pt_key
713+
714+ return None , pt_key
715+
716+ w = state_dict [pt_key ].cpu ().numpy ()
717+
718+ # Debug Special Parameters
719+ if "scale_shift_table" in str (jax_key ):
720+ print (f"Mapping scale_shift_table for { jax_key } from { pt_key } with shape { w .shape } " )
721+
722+ # Handle vmap/scan dimension for transformer_blocks
723+ if is_transformer_block :
724+ # JAX expects (num_layers, ...) for these weights
725+ # PT has (...)
726+ # So expand dims
727+ w = w [None , ...]
728+
729+ # Handle Transforms
730+ is_kernel = "kernel" in str (jax_key )
731+ # Embedding projections are also 'kernel' in JAX (Linear)
732+ if is_kernel :
733+ if w .ndim == 3 : # (1, out, in) -> (1, in, out)
734+ w = w .transpose (0 , 2 , 1 )
735+ elif w .ndim == 2 : # (out, in) -> (in, out)
736+ w = w .T
737+
738+ return jnp .array (w ), pt_key
739+
740+ total_count = len (flat_state )
741+ mapped_count = 0
742+
743+ # Debug: Print available keys for audio_ff
744+ print ("Debugging PT keys for mapping failure diagnosis (Split):" )
745+ print ("Available PT keys with 'ff':" , [k for k in state_dict .keys () if "ff" in k ])
746+ print ("Available PT keys with 'norm':" , [k for k in state_dict .keys () if "norm" in k ])
747+
748+ for key in flat_state .keys ():
749+ # Construct base PT key from JAX key tuple
750+ pt_key_base = "." .join ([str (k ) for k in key if str (k ) != "layers" ])
751+
752+ w , used_pt_key = convert_weight (pt_key_base , key )
753+ if w is not None :
754+ # Handle bias zero init which might return scalar 0 if shape was (1,) but it should be array
755+ # jnp.zeros(shape) returns array.
756+ new_flat_state [key ] = w
757+ mapped_count += 1
758+ else :
759+ print (f"Warning: Could not map JAX key { key } (PT attempt: { used_pt_key } )" )
760+ if "audio_ff" in str (key ):
761+ print ("Available audio_ff keys:" , [k for k in state_dict .keys () if "audio_ff" in k ])
762+ if "norm_out" in str (key ):
763+ print ("Available norm_out keys:" , [k for k in state_dict .keys () if "norm_out" in k ])
764+
765+ print (f"Mapped { mapped_count } /{ total_count } params." )
766+
767+ # Update model state
768+ new_state = traverse_util .unflatten_dict (new_flat_state )
769+ nnx .update (model , new_state )
770+
771+ # 3. Prepare Inputs
772+ jax_inputs = {
773+ "hidden_states" : jnp .array (inputs ["hidden_states" ].cpu ().numpy ()),
774+ "audio_hidden_states" : jnp .array (inputs ["audio_hidden_states" ].cpu ().numpy ()),
775+ "encoder_hidden_states" : jnp .array (inputs ["encoder_hidden_states" ].cpu ().numpy ()),
776+ "audio_encoder_hidden_states" : jnp .array (inputs ["audio_encoder_hidden_states" ].cpu ().numpy ()),
777+ "timestep" : jnp .array (inputs ["timestep" ].cpu ().numpy ()),
778+ "encoder_attention_mask" : jnp .array (inputs ["encoder_attention_mask" ].cpu ().numpy ()),
779+ "audio_encoder_attention_mask" : jnp .array (inputs ["audio_encoder_attention_mask" ].cpu ().numpy ()),
780+ }
781+
782+ print ("\n === Input Verification (Split) ===" )
783+ print (f"Hidden States Sum: { jnp .sum (jax_inputs ['hidden_states' ])} " )
784+ print (f"Audio Hidden States Sum: { jnp .sum (jax_inputs ['audio_hidden_states' ])} " )
785+ print (f"Encoder Hidden States Sum: { jnp .sum (jax_inputs ['encoder_hidden_states' ])} " )
786+ print (f"Audio Encoder Hidden States Sum: { jnp .sum (jax_inputs ['audio_encoder_hidden_states' ])} " )
787+ print (f"Timestep: { jax_inputs ['timestep' ]} " )
788+ print ("==========================\n " )
789+
790+ # 4. Run Forward
791+ print ("Running MaxDiffusion forward pass (Split)..." )
792+ output = model (
793+ hidden_states = jax_inputs ["hidden_states" ],
794+ audio_hidden_states = jax_inputs ["audio_hidden_states" ],
795+ encoder_hidden_states = jax_inputs ["encoder_hidden_states" ],
796+ audio_encoder_hidden_states = jax_inputs ["audio_encoder_hidden_states" ],
797+ timestep = jax_inputs ["timestep" ],
798+ encoder_attention_mask = jax_inputs ["encoder_attention_mask" ],
799+ audio_encoder_attention_mask = jax_inputs ["audio_encoder_attention_mask" ],
800+ num_frames = config ["num_frames" ] if "num_frames" in config else 4 ,
801+ height = config ["height" ] if "height" in config else 32 ,
802+ width = config ["width" ] if "width" in config else 32 ,
803+ audio_num_frames = 128 ,
804+ fps = 24.0 ,
805+ return_dict = True ,
806+ )
807+
808+ max_sample = output ["sample" ]
809+ max_audio_sample = output ["audio_sample" ]
810+
811+ print ("MAXDIFF Output Sample Stats (Split):" )
812+ print (f"Sample Max: { jnp .max (max_sample )} " )
813+ print (f"Sample Min: { jnp .min (max_sample )} " )
814+ print (f"Sample Mean: { jnp .mean (max_sample )} " )
815+ print (f"Sample Std: { jnp .std (max_sample )} " )
816+
817+ print ("MAXDIFF Output Audio Sample Stats (Split):" )
818+ print (f"Audio Max: { jnp .max (max_audio_sample )} " )
819+ print (f"Audio Min: { jnp .min (max_audio_sample )} " )
820+ print (f"Audio Mean: { jnp .mean (max_audio_sample )} " )
821+ print (f"Audio Std: { jnp .std (max_audio_sample )} " )
822+
823+ # 5. Parity Check
824+ parity_sample = jnp .array (torch_outputs ["sample" ].cpu ().numpy ())
825+ parity_audio_sample = jnp .array (torch_outputs ["audio_sample" ].cpu ().numpy ())
826+
827+ print ("Checking Parity (Split)..." )
828+ print (f"Max Diff Sample: { jnp .max (jnp .abs (max_sample - parity_sample ))} " )
829+ print (f"Max Diff Audio: { jnp .max (jnp .abs (max_audio_sample - parity_audio_sample ))} " )
830+
831+ self .assertTrue (jnp .allclose (max_sample , parity_sample , atol = 1e-3 ))
832+ self .assertTrue (jnp .allclose (max_audio_sample , parity_audio_sample , atol = 1e-3 ))
833+ print ("Parity check passed (Split)!" )
834+
835+
# Allow running this test module directly (e.g. `python this_file.py`);
# unittest.main() discovers and runs all TestCase methods in the module.
if __name__ == "__main__":
  unittest.main()
0 commit comments