17 changes: 17 additions & 0 deletions examples/llm_ptq/README.md
@@ -161,6 +161,23 @@ scripts/huggingface_example.sh --model $HF_PATH --quant [fp8|nvfp4|int8_sq|int4_

[PTQ for DeepSeek](../deepseek/README.md) shows how to quantize the DeepSeek model with FP4 and export to TensorRT-LLM.

#### VLM calibration with image-text pairs (e.g., Nemotron VL)

For vision-language models, calibration quality is likely to improve when image-text pairs are used instead of text-only data, especially on visual understanding tasks:

```bash
python hf_ptq.py \
--pyt_ckpt_path <huggingface_model_card> \
--qformat nvfp4 \
--export_path <quantized_ckpt_path> \
--trust_remote_code \
--calib_with_images \
--calib_size 512
```

> Note: When `--calib_with_images` is set, `--calib_size` must be a single value.
> This functionality is currently in beta and has been tested on `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16`.
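
If GPU memory is tight during image-text calibration, the calibration batch size and sequence length can be reduced. The invocation below is a sketch: it assumes the argument parser exposes `--batch_size` and `--calib_seq` (check `python hf_ptq.py --help` for the exact flag names):

```bash
python hf_ptq.py \
    --pyt_ckpt_path <huggingface_model_card> \
    --qformat nvfp4 \
    --export_path <quantized_ckpt_path> \
    --trust_remote_code \
    --calib_with_images \
    --calib_size 256 \
    --batch_size 1 \
    --calib_seq 2048
```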

### NeMo Example Script

NeMo 2.0 framework PTQ and TensorRT-LLM deployment examples are maintained in the NeMo GitHub repo. Please refer to the [NeMo PTQ documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/model-optimization/quantization/quantization.html) for more details.
134 changes: 128 additions & 6 deletions examples/llm_ptq/hf_ptq.py
@@ -14,6 +14,7 @@
# limitations under the License.

import argparse
import inspect
import random
import time
import warnings
@@ -32,6 +33,7 @@
is_nemotron_vl,
run_nemotron_vl_preview,
)
from nemotron_vl_calib import safe_nemotron_vl_forward
from torch.utils.data import DataLoader
from transformers import (
AutoConfig,
@@ -107,7 +109,30 @@ def make_calib_dataloader(
) -> tuple[DataLoader, str | None]:
calib_dataloader = None
first_text_speech_dataset = None
if model_type == "mllama":
if getattr(args, "calib_with_images", False):
# VLM image-text calibration path: assume Nemotron VLM dataset by default.
assert processor is not None, (
"Please provide a processor (e.g., AutoProcessor) for image calibration."
)
assert len(args.calib_size) == 1, (
"Image calibration currently supports a single dataset. "
"Please pass --calib_size with one value (e.g., --calib_size 256)."
)
calib_dataloader = get_vlm_dataset_dataloader(
dataset_name="nemotron_vlm_dataset_v2",
processor=processor,
batch_size=args.batch_size,
num_samples=args.calib_size[0],
device=device,
max_length=args.calib_seq,
require_image=True,
subsets=["sparsetables", "plotqa_cot", "wiki_en"],
shuffle_buffer_size=10_000,
seed=42,
use_media_shards=True,
max_shards=1,
)
elif model_type == "mllama":
assert processor is not None and isinstance(processor, MllamaImageProcessor), (
"The MllamaImageProcessor must be set."
)
@@ -164,6 +189,12 @@ def auto_quantize(
):
"""Auto search quantization of multiple formats."""

if getattr(args, "calib_with_images", False):
raise NotImplementedError(
"AutoQuantize with image-text calibration is not supported yet. "
"Please run plain PTQ (e.g., --qformat nvfp4) with --calib_with_images."
)

assert not (args.auto_quantize_bits and args.inference_pipeline_parallel > 1), (
"Auto Quantization is not supported for pipeline parallel size > 1"
)
@@ -291,7 +322,9 @@ def load_model(args: argparse.Namespace):
tokenizer = None
language_model = full_model
default_padding_side = None
default_pad_token = None

is_nemotron_vl_model = is_nemotron_vl(full_model)
if model_type == "mllama":
processor = get_processor(
args.pyt_ckpt_path,
@@ -307,6 +340,47 @@
device,
trust_remote_code=args.trust_remote_code,
)
elif is_nemotron_vl_model and getattr(args, "calib_with_images", False):
# For Nemotron VL image calibration, we need an AutoProcessor to build multimodal inputs.
try:
processor = AutoProcessor.from_pretrained(
args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code, padding_side="left"
)
except Exception as e:
raise RuntimeError(
"Failed to load AutoProcessor for Nemotron VL image calibration. "
"Please ensure the checkpoint provides a compatible processor."
) from e

if hasattr(processor, "tokenizer") and processor.tokenizer is not None:
tokenizer = processor.tokenizer
else:
tokenizer = get_tokenizer(args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code)

default_pad_token = tokenizer.pad_token
# Some Nemotron tokenizers may not define pad_token by default, but padding=True during calibration requires one, so fall back to eos_token.
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
assert tokenizer.pad_token is not None, f"Pad token for {args.pyt_ckpt_path} cannot be set!"

default_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"

# Quantize only the language model, but keep the full_model for calibration forward.
language_model_lineage = get_language_model_from_vl(full_model)
if language_model_lineage is not None:
language_model = language_model_lineage.pop(-1)
ancestors = language_model_lineage
disabled_quant_cfg = {"quant_cfg": {"default": {"enable": False}}, "algorithm": "max"}

memo = set(ancestors) | {language_model}
for ancestor in ancestors:
for _, module in ancestor.named_children():
if module not in memo:
mtq.quantize(module, disabled_quant_cfg, forward_loop=None)
memo.add(module)

model_type = get_model_type(language_model)
else:
if args.dataset is None:
args.dataset = ["cnn_dailymail", "nemotron-post-training-dataset-v2"]
Expand All @@ -320,6 +394,7 @@ def load_model(args: argparse.Namespace):
tokenizer = get_tokenizer(args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code)

default_padding_side = tokenizer.padding_side
default_pad_token = tokenizer.pad_token
# Left padding usually provides better calibration result.
tokenizer.padding_side = "left"

Expand Down Expand Up @@ -355,6 +430,7 @@ def load_model(args: argparse.Namespace):
processor,
tokenizer,
default_padding_side,
default_pad_token,
device,
)

@@ -432,9 +508,33 @@ def mono_quantize(

if not use_calibration:
warnings.warn("Dynamic quantization. Calibration skipped.")
calibrate_loop = (
create_forward_loop(dataloader=calib_dataloader) if use_calibration else None
)
calibrate_loop = None
if use_calibration:
base_forward_loop = create_forward_loop(dataloader=calib_dataloader)
# For Nemotron VL image calibration, the dataloader yields multimodal kwargs (e.g., pixel_values).
# Those kwargs must be consumed by the *full* VLM model, not the extracted language_model.
if getattr(args, "calib_with_images", False) and is_nemotron_vl_model:

def calibrate_full_model(_model):
forward_params = inspect.signature(full_model.forward).parameters
accepts_kwargs = any(
p.kind == inspect.Parameter.VAR_KEYWORD for p in forward_params.values()
)
allowed_keys = set(forward_params.keys())

full_model.eval()
with torch.no_grad():
for batch in calib_dataloader:
if accepts_kwargs:
call_kwargs = batch
else:
call_kwargs = {k: v for k, v in batch.items() if k in allowed_keys}
call_kwargs = {k: v for k, v in call_kwargs.items() if v is not None}
safe_nemotron_vl_forward(full_model, call_kwargs)

calibrate_loop = calibrate_full_model
else:
calibrate_loop = base_forward_loop

if calibration_only:
language_model = mtq.calibrate(
@@ -461,6 +561,7 @@ def export_quantized(
model_type: str | None,
tokenizer: PreTrainedTokenizerBase | None,
default_padding_side,
default_pad_token,
):
with torch.inference_mode():
if model_type is None:
@@ -546,6 +647,8 @@ def export_quantized(
# Restore default padding and export the tokenizer as well.
if tokenizer is not None:
tokenizer.padding_side = default_padding_side
if default_pad_token is not None:
tokenizer.pad_token = default_pad_token
tokenizer.save_pretrained(export_path)

end_time = time.time()
@@ -690,6 +793,7 @@ def quantize_main(
processor: BaseImageProcessor | ProcessorMixin | None,
tokenizer: PreTrainedTokenizerBase | None,
default_padding_side,
default_pad_token,
device: torch.device,
):
if args.batch_size == 0:
@@ -805,7 +909,15 @@
is_nemotron_vl_model,
first_text_speech_dataset,
)
export_quantized(args, full_model, language_model, model_type, tokenizer, default_padding_side)
export_quantized(
args,
full_model,
language_model,
model_type,
tokenizer,
default_padding_side,
default_pad_token,
)


def parse_args() -> argparse.Namespace:
@@ -856,6 +968,14 @@ def parse_args() -> argparse.Namespace:
type=str,
default=None,
)
parser.add_argument(
"--calib_with_images",
action="store_true",
help=(
"Calibrate with image-text pairs (for VLMs). "
"This uses nemotron_vlm_dataset_v2 with default subsets (sparsetables, plotqa_cot, wiki_en)."
),
)
parser.add_argument("--inference_tensor_parallel", type=int, default=1)
parser.add_argument("--inference_pipeline_parallel", type=int, default=1)
parser.add_argument("--awq_block_size", default=0, type=int)
@@ -993,6 +1113,7 @@ def main(args: argparse.Namespace):
processor,
tokenizer,
default_padding_side,
default_pad_token,
device,
) = load_model(args)

@@ -1010,6 +1131,7 @@
processor,
tokenizer,
default_padding_side,
default_pad_token,
device,
)

@@ -1020,6 +1142,6 @@
if args.export_fmt != "hf":
warnings.warn("Deprecated. --export_fmt forced to hf.")

args.dataset = args.dataset.split(",") if args.dataset else None
args.dataset = args.dataset.split(",") if isinstance(args.dataset, str) else args.dataset
args.calib_size = [int(num_sample) for num_sample in args.calib_size.split(",")]
main(args)
98 changes: 98 additions & 0 deletions examples/llm_ptq/nemotron_vl_calib.py
@@ -0,0 +1,98 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Nemotron VL calibration helpers.

The Nemotron Nano VL v2 remote-code wrapper's `forward()` is not ideal to call during PTQ calibration because it may:
- Call `torch.distributed.get_rank()` unconditionally
- Assume `past_key_values` exists in the language model output

Instead, we run a "safe multimodal forward" that exercises:
- Vision encoder feature extraction (C-RADIOv2-H)
- Insertion of vision embeddings into token embeddings at `img_context_token_id`
- Language model forward pass (to trigger quantizer calibration)
"""

from __future__ import annotations

import contextlib
from typing import Any

import torch


def safe_nemotron_vl_forward(full_model: torch.nn.Module, batch: dict[str, Any]) -> None:
"""Run a minimal multimodal forward for Nemotron VL that avoids wrapper output packaging."""
pixel_values = batch.get("pixel_values")
input_ids = batch.get("input_ids")
attention_mask = batch.get("attention_mask")
position_ids = batch.get("position_ids")
image_flags = batch.get("image_flags")

if pixel_values is None or input_ids is None:
return

# Nemotron Nano VL v2 expects `image_flags` in forward(), but the processor doesn't always emit it.
# `pixel_values` is flattened across batch*images, so `image_flags` should align with pixel_values.shape[0].
if image_flags is None and torch.is_tensor(pixel_values):
image_flags = torch.ones(
(pixel_values.shape[0], 1), device=pixel_values.device, dtype=torch.long
)
if image_flags is None:
return

# Match the model's preferred vision dtype (usually bf16).
vision_dtype = None
with contextlib.suppress(Exception):
vision_dtype = getattr(full_model.vision_model.config, "torch_dtype", None)
if vision_dtype is None:
with contextlib.suppress(Exception):
vision_dtype = getattr(full_model.language_model.config, "torch_dtype", None)
if (
vision_dtype is not None
and torch.is_tensor(pixel_values)
and pixel_values.dtype != vision_dtype
):
pixel_values = pixel_values.to(dtype=vision_dtype)

# Token embeddings
inputs_embeds = full_model.language_model.get_input_embeddings()(input_ids)
image_flags_s = image_flags.squeeze(-1)

b, n, c = inputs_embeds.shape
flat_embeds = inputs_embeds.reshape(b * n, c)
flat_ids = input_ids.reshape(b * n)
selected = flat_ids == full_model.img_context_token_id

# Vision embeddings
vit_embeds = full_model.extract_feature(pixel_values)
vit_embeds = vit_embeds[image_flags_s == 1]
try:
flat_embeds[selected] = flat_embeds[selected] * 0.0 + vit_embeds.reshape(-1, c)
except Exception:
vit_embeds = vit_embeds.reshape(-1, c)
n_token = selected.sum()
flat_embeds[selected] = flat_embeds[selected] * 0.0 + vit_embeds[:n_token]

inputs_embeds = flat_embeds.reshape(b, n, c)

# LLM forward (drives activation stats)
full_model.language_model(
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
position_ids=position_ids,
use_cache=False,
return_dict=False,
)