
Commit 33d00d0

supports flux kontext with multiple input images (#173)

1 parent 9821529

7 files changed: 441 additions & 146 deletions

diffsynth_engine/conf/models/flux/flux_dit.json

Lines changed: 20 additions & 1 deletion
@@ -101,5 +101,24 @@
       "proj_mlp": "proj_in_besides_attn",
       "proj_out": "proj_out"
     }
-  }
+  },
+  "preferred_kontext_resolutions": [
+    [672, 1568],
+    [688, 1504],
+    [720, 1456],
+    [752, 1392],
+    [800, 1328],
+    [832, 1248],
+    [880, 1184],
+    [944, 1104],
+    [1024, 1024],
+    [1104, 944],
+    [1184, 880],
+    [1248, 832],
+    [1328, 800],
+    [1392, 752],
+    [1456, 720],
+    [1504, 688],
+    [1568, 672]
+  ]
 }
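
Each of the 17 preferred Kontext resolutions is close to one megapixel, so the table effectively enumerates aspect-ratio buckets from about 0.43 (672x1568) to 2.33 (1568x672). A minimal sketch of how a pipeline might snap an input image to this table (the helper below is illustrative, not this repository's API):

import math

# Pairs copied from the new "preferred_kontext_resolutions" entry above.
PREFERRED_KONTEXT_RESOLUTIONS = [
    (672, 1568), (688, 1504), (720, 1456), (752, 1392), (800, 1328),
    (832, 1248), (880, 1184), (944, 1104), (1024, 1024), (1104, 944),
    (1184, 880), (1248, 832), (1328, 800), (1392, 752), (1456, 720),
    (1504, 688), (1568, 672),
]

def nearest_kontext_resolution(width: int, height: int) -> tuple[int, int]:
    # Choose the bucket whose aspect ratio is closest to the input's;
    # comparing log-ratios treats wide and tall mismatches symmetrically.
    target = math.log(width / height)
    return min(
        PREFERRED_KONTEXT_RESOLUTIONS,
        key=lambda wh: abs(math.log(wh[0] / wh[1]) - target),
    )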

diffsynth_engine/conf/models/flux/flux_vae.json

Lines changed: 253 additions & 5 deletions
Large diffs are not rendered by default.

diffsynth_engine/models/flux/flux_controlnet.py

Lines changed: 9 additions & 11 deletions
@@ -119,18 +119,16 @@ def patchify(self, hidden_states):

     def forward(
         self,
-        hidden_states,
-        control_condition,
-        control_scale,
-        timestep,
-        prompt_emb,
-        pooled_prompt_emb,
-        guidance,
-        image_ids,
-        text_ids,
+        hidden_states: torch.Tensor,
+        control_condition: torch.Tensor,
+        control_scale: float,
+        timestep: torch.Tensor,
+        prompt_emb: torch.Tensor,
+        pooled_prompt_emb: torch.Tensor,
+        image_ids: torch.Tensor,
+        text_ids: torch.Tensor,
+        guidance: torch.Tensor,
     ):
-        hidden_states = self.patchify(hidden_states)
-        control_condition = self.patchify(control_condition)
         hidden_states = self.x_embedder(hidden_states) + self.controlnet_x_embedder(control_condition)
         condition = (
             self.time_embedder(timestep, hidden_states.dtype)
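
Both patchify calls were deleted from forward, and guidance moved to the end of the now-annotated signature: the ControlNet expects token sequences that were already patchified once upstream, matching the DiT change below. A rough sketch of the resulting calling convention (variable names and return unpacking are assumptions, not the repository's exact code):

# Assumed convention: patchify once, share the token sequences between
# the DiT and the ControlNet instead of patchifying in each module.
hidden_states = dit.patchify(latents)              # (B, seq_len, dim) tokens
control_condition = dit.patchify(control_latents)  # same token layout
double_out, single_out = controlnet(
    hidden_states, control_condition, control_scale,
    timestep, prompt_emb, pooled_prompt_emb,
    image_ids, text_ids, guidance,
)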

diffsynth_engine/models/flux/flux_dit.py

Lines changed: 17 additions & 21 deletions
@@ -2,7 +2,7 @@
 import torch
 import torch.nn as nn
 import numpy as np
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional
 from einops import rearrange

 from diffsynth_engine.models.basic.transformer_helper import (
@@ -245,7 +245,7 @@ def __init__(
         self.ff_a = nn.Sequential(
             nn.Linear(dim, dim * 4, device=device, dtype=dtype),
             nn.GELU(approximate="tanh"),
-            nn.Linear(dim * 4, dim, device=device, dtype=dtype)
+            nn.Linear(dim * 4, dim, device=device, dtype=dtype),
         )
         # Text
         self.norm_msa_b = AdaLayerNormZero(dim, device=device, dtype=dtype)
@@ -395,21 +395,19 @@ def prepare_image_ids(latents: torch.Tensor):

     def forward(
         self,
-        hidden_states,
-        timestep,
-        prompt_emb,
-        pooled_prompt_emb,
-        image_emb,
-        guidance,
-        text_ids,
-        image_ids=None,
-        controlnet_double_block_output=None,
-        controlnet_single_block_output=None,
+        hidden_states: torch.Tensor,
+        timestep: torch.Tensor,
+        prompt_emb: torch.Tensor,
+        pooled_prompt_emb: torch.Tensor,
+        image_ids: torch.Tensor,
+        text_ids: torch.Tensor,
+        guidance: torch.Tensor,
+        image_emb: torch.Tensor | None = None,
+        controlnet_double_block_output: List[torch.Tensor] | None = None,
+        controlnet_single_block_output: List[torch.Tensor] | None = None,
         **kwargs,
     ):
-        h, w = hidden_states.shape[-2:]
-        if image_ids is None:
-            image_ids = self.prepare_image_ids(hidden_states)
+        image_seq_len = hidden_states.shape[1]
         controlnet_double_block_output = (
             controlnet_double_block_output if controlnet_double_block_output is not None else ()
         )
@@ -428,10 +426,10 @@ def forward(
                 timestep,
                 prompt_emb,
                 pooled_prompt_emb,
-                image_emb,
-                guidance,
-                text_ids,
                 image_ids,
+                text_ids,
+                guidance,
+                image_emb,
                 *controlnet_double_block_output,
                 *controlnet_single_block_output,
             ),
@@ -448,7 +446,6 @@ def forward(
         rope_emb = self.pos_embedder(torch.cat((text_ids, image_ids), dim=1))
         text_rope_emb = rope_emb[:, :, : text_ids.size(1)]
         image_rope_emb = rope_emb[:, :, text_ids.size(1) :]
-        hidden_states = self.patchify(hidden_states)

         with sequence_parallel(
             (
@@ -489,9 +486,8 @@ def forward(
         hidden_states = hidden_states[:, prompt_emb.shape[1] :]
         hidden_states = self.final_norm_out(hidden_states, conditioning)
         hidden_states = self.final_proj_out(hidden_states)
-        (hidden_states,) = sequence_parallel_unshard((hidden_states,), seq_dims=(1,), seq_lens=(h * w // 4,))
+        (hidden_states,) = sequence_parallel_unshard((hidden_states,), seq_dims=(1,), seq_lens=(image_seq_len,))

-        hidden_states = self.unpatchify(hidden_states, h, w)
         (hidden_states,) = cfg_parallel_unshard((hidden_states,), use_cfg=use_cfg)
         return hidden_states
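
Moving patchify/unpatchify out of forward is the change that enables multiple Kontext input images: once reference-image tokens are concatenated after the noisy latent tokens, a single (h, w) pair no longer describes the sequence, so forward reads image_seq_len directly from the pre-patchified input. A minimal sketch of the assumed pipeline-side preparation (the helper and the image_ids indexing convention are assumptions, not this commit's exact code):

import torch

def build_kontext_sequence(dit, latents, kontext_latents):
    # Noisy latent tokens first; each reference image appends its own tokens,
    # distinguished by a per-image index written into image_ids.
    tokens, ids = [dit.patchify(latents)], [dit.prepare_image_ids(latents)]
    for i, ref in enumerate(kontext_latents, start=1):
        ref_ids = dit.prepare_image_ids(ref)
        ref_ids[..., 0] = i  # assumed convention: 0 marks the noisy latents
        tokens.append(dit.patchify(ref))
        ids.append(ref_ids)
    return torch.cat(tokens, dim=1), torch.cat(ids, dim=1)

The pipeline would then presumably keep only the leading noisy-latent tokens of the output before unpatchifying, since the reference-image tokens are conditioning only.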

diffsynth_engine/models/flux/flux_dit_fbcache.py

Lines changed: 17 additions & 21 deletions
@@ -1,6 +1,6 @@
 import torch
 import numpy as np
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional

 from diffsynth_engine.utils.gguf import gguf_inference
 from diffsynth_engine.utils.fp8_linear import fp8_inference
@@ -48,21 +48,19 @@ def refresh_cache_status(self, num_inference_steps):

     def forward(
         self,
-        hidden_states,
-        timestep,
-        prompt_emb,
-        pooled_prompt_emb,
-        image_emb,
-        guidance,
-        text_ids,
-        image_ids=None,
-        controlnet_double_block_output=None,
-        controlnet_single_block_output=None,
+        hidden_states: torch.Tensor,
+        timestep: torch.Tensor,
+        prompt_emb: torch.Tensor,
+        pooled_prompt_emb: torch.Tensor,
+        image_ids: torch.Tensor,
+        text_ids: torch.Tensor,
+        guidance: torch.Tensor,
+        image_emb: torch.Tensor | None = None,
+        controlnet_double_block_output: List[torch.Tensor] | None = None,
+        controlnet_single_block_output: List[torch.Tensor] | None = None,
         **kwargs,
     ):
-        h, w = hidden_states.shape[-2:]
-        if image_ids is None:
-            image_ids = self.prepare_image_ids(hidden_states)
+        image_seq_len = hidden_states.shape[1]
         controlnet_double_block_output = (
             controlnet_double_block_output if controlnet_double_block_output is not None else ()
         )
@@ -81,10 +79,10 @@ def forward(
             timestep,
             prompt_emb,
             pooled_prompt_emb,
-            image_emb,
-            guidance,
-            text_ids,
             image_ids,
+            text_ids,
+            guidance,
+            image_emb,
             *controlnet_double_block_output,
             *controlnet_single_block_output,
         ),
@@ -101,7 +99,6 @@ def forward(
         rope_emb = self.pos_embedder(torch.cat((text_ids, image_ids), dim=1))
         text_rope_emb = rope_emb[:, :, : text_ids.size(1)]
         image_rope_emb = rope_emb[:, :, text_ids.size(1) :]
-        hidden_states = self.patchify(hidden_states)

         with sequence_parallel(
             (
@@ -131,7 +128,7 @@ def forward(
         first_hidden_states_residual = hidden_states - original_hidden_states

         (first_hidden_states_residual,) = sequence_parallel_unshard(
-            (first_hidden_states_residual,), seq_dims=(1,), seq_lens=(h * w // 4,)
+            (first_hidden_states_residual,), seq_dims=(1,), seq_lens=(image_seq_len,)
         )

         if self.step_count == 0 or self.step_count == (self.num_inference_steps - 1):
@@ -172,9 +169,8 @@ def forward(

         hidden_states = self.final_norm_out(hidden_states, conditioning)
         hidden_states = self.final_proj_out(hidden_states)
-        (hidden_states,) = sequence_parallel_unshard((hidden_states,), seq_dims=(1,), seq_lens=(h * w // 4,))
+        (hidden_states,) = sequence_parallel_unshard((hidden_states,), seq_dims=(1,), seq_lens=(image_seq_len,))

-        hidden_states = self.unpatchify(hidden_states, h, w)
         (hidden_states,) = cfg_parallel_unshard((hidden_states,), use_cfg=use_cfg)

         return hidden_states
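
The FBCache variant mirrors the signature and image_seq_len changes above. For context, the first_hidden_states_residual it unshards drives the first-block-cache decision; an illustrative sketch of that general idea (not this file's exact code):

import torch

def should_reuse_cache(residual: torch.Tensor, prev_residual: torch.Tensor, threshold: float) -> bool:
    # If the first block's output residual barely moved since the previous
    # denoising step, the remaining blocks are assumed to produce a similar
    # update, so their cached output can be reused instead of recomputed.
    rel_change = (residual - prev_residual).abs().mean() / prev_residual.abs().mean()
    return rel_change.item() < threshold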

diffsynth_engine/models/flux/flux_vae.py

Lines changed: 19 additions & 1 deletion
@@ -25,11 +25,29 @@ def _from_civitai(self, state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
             new_state_dict[name_] = param
         return new_state_dict

+    def _from_diffusers(self, state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+        rename_dict = config["diffusers"]["rename_dict"]
+        new_state_dict = {}
+        for name, param in state_dict.items():
+            if name not in rename_dict:
+                continue
+            name_ = rename_dict[name]
+            if "transformer_blocks" in name_:
+                param = param.squeeze()
+            new_state_dict[name_] = param
+        return new_state_dict
+
     def convert(self, state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
         assert self.has_decoder or self.has_encoder, "Either decoder or encoder must be present"
-        if "decoder.conv_in.weight" in state_dict or "encoder.conv_in.weight" in state_dict:
+        if "decoder.up.0.block.0.conv1.weight" in state_dict or "encoder.down.0.block.0.conv1.weight" in state_dict:
             state_dict = self._from_civitai(state_dict)
             logger.info("use civitai format state dict")
+        elif (
+            "decoder.up_blocks.0.resnets.0.conv1.weight" in state_dict
+            or "encoder.down_blocks.0.resnets.0.conv1.weight" in state_dict
+        ):
+            state_dict = self._from_diffusers(state_dict)
+            logger.info("use diffusers format state dict")
         else:
             logger.info("use diffsynth format state dict")
         return self._filter(state_dict)
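
convert now distinguishes three checkpoint layouts by probing format-specific keys: decoder.up.0.block.0.conv1.weight marks the civitai layout, decoder.up_blocks.0.resnets.0.conv1.weight marks a diffusers export, and anything else is treated as the native diffsynth format. The deeper keys replace the old decoder.conv_in.weight probe, likely because both layouts contain that key; the squeeze() on parameters mapped into transformer_blocks presumably drops trailing singleton convolution dimensions. A hypothetical usage sketch (the file path and converter variable are illustrative):

from safetensors.torch import load_file

state_dict = load_file("flux_vae_diffusers.safetensors")  # hypothetical path
state_dict = vae_converter.convert(state_dict)
# logs "use diffusers format state dict" and routes through _from_diffusers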
