Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions src/MaxText/layers/encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,14 @@ def _setup_vision_encoder_layers(self):
def __call__(self, input_images, deterministic=False):
# vision encoder output, frozen params in many cases
encoder = getattr(self, self.encoder_name)
embeddings = encoder(input_images, deterministic=deterministic)
encoder_output = encoder(input_images, deterministic=deterministic)

deep_feats = None
Comment thread
entrpn marked this conversation as resolved.
if isinstance(encoder_output, tuple):
embeddings = encoder_output[0]
deep_feats = encoder_output[1]
else:
embeddings = encoder_output

if self.config.freeze_vision_encoder_params:
embeddings = jax.lax.stop_gradient(embeddings)
Expand All @@ -73,7 +80,7 @@ def __call__(self, input_images, deterministic=False):
projector = getattr(self, self.projector_name)
embeddings = projector(embeddings)

return embeddings
return embeddings, deep_feats


class AudioEncoder(nnx.Module):
Expand Down
5 changes: 3 additions & 2 deletions src/MaxText/layers/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,8 @@ def __call__(
audio_embeddings = None

if self.config.use_multimodal and encoder_images is not None:
image_embeddings = self.vision_encoder(input_images=encoder_images, deterministic=not enable_dropout)
# qwen3-omni-30b-a3b returns deep features from the vision encoder.
image_embeddings, _ = self.vision_encoder(input_images=encoder_images, deterministic=not enable_dropout)
Comment thread
entrpn marked this conversation as resolved.
bidirectional_mask = mm_processor.get_bidirectional_mask_vision(self.config, decoder_input_tokens)

if self.config.use_multimodal and encoder_audios is not None and self.audio_encoder is not None:
Expand Down Expand Up @@ -459,7 +460,7 @@ def __call__(
bidirectional_mask = None
image_embeddings = None
if self.config.use_multimodal and encoder_images is not None:
image_embeddings = self.vision_encoder(input_images=encoder_images, deterministic=not enable_dropout)
image_embeddings, _ = self.vision_encoder(input_images=encoder_images, deterministic=not enable_dropout)
bidirectional_mask = mm_processor.get_bidirectional_mask_vision(self.config, decoder_input_tokens)

audio_embeddings = None
Expand Down
Loading
Loading