17 changes: 17 additions & 0 deletions examples/llm_ptq/README.md
@@ -161,6 +161,23 @@ scripts/huggingface_example.sh --model $HF_PATH --quant [fp8|nvfp4|int8_sq|int4_

[PTQ for DeepSeek](../deepseek/README.md) shows how to quantize the DeepSeek model with FP4 and export to TensorRT-LLM.

#### VLM calibration with image-text pairs (e.g., Nemotron VL)

For vision-language models, calibration quality is likely to improve when image-text pairs are used instead of text-only data, especially on visual understanding tasks:

```bash
python hf_ptq.py \
--pyt_ckpt_path <huggingface_model_card> \
--qformat nvfp4 \
--export_path <quantized_ckpt_path> \
--trust_remote_code \
--calib_with_images \
--calib_size 512
```

> Note: When `--calib_with_images` is set, `--calib_size` must be a single value.
> This functionality is currently in beta and has been tested on `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16`.
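
If GPU memory is tight during image-text calibration, the calibration batch size and sequence length can be reduced. The invocation below is a sketch: it assumes the argument parser exposes `--batch_size` and `--calib_seq` (check `python hf_ptq.py --help` for the exact flag names):

```bash
python hf_ptq.py \
    --pyt_ckpt_path <huggingface_model_card> \
    --qformat nvfp4 \
    --export_path <quantized_ckpt_path> \
    --trust_remote_code \
    --calib_with_images \
    --calib_size 256 \
    --batch_size 1 \
    --calib_seq 2048
```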

### NeMo Example Script

NeMo 2.0 framework PTQ and TensorRT-LLM deployment examples are maintained in the NeMo GitHub repo. Please refer to the [NeMo PTQ documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/model-optimization/quantization/quantization.html) for more details.
134 changes: 128 additions & 6 deletions examples/llm_ptq/hf_ptq.py
@@ -14,6 +14,7 @@
# limitations under the License.

import argparse
import inspect
import random
import time
import warnings
@@ -32,6 +33,7 @@
is_nemotron_vl,
run_nemotron_vl_preview,
)
from nemotron_vl_calib import safe_nemotron_vl_forward
from torch.utils.data import DataLoader
from transformers import (
AutoConfig,
@@ -107,7 +109,30 @@ def make_calib_dataloader(
) -> tuple[DataLoader, str | None]:
calib_dataloader = None
first_text_speech_dataset = None
if model_type == "mllama":
if getattr(args, "calib_with_images", False):
# VLM image-text calibration path: assume Nemotron VLM dataset by default.
assert processor is not None, (
"Please provide a processor (e.g., AutoProcessor) for image calibration."
)
assert len(args.calib_size) == 1, (
"Image calibration currently supports a single dataset. "
"Please pass --calib_size with one value (e.g., --calib_size 256)."
)
calib_dataloader = get_vlm_dataset_dataloader(
dataset_name="nemotron_vlm_dataset_v2",
processor=processor,
batch_size=args.batch_size,
num_samples=args.calib_size[0],
device=device,
max_length=args.calib_seq,
require_image=True,
subsets=["sparsetables", "plotqa_cot", "wiki_en"],
shuffle_buffer_size=10_000,
seed=42,
use_media_shards=True,
max_shards=1,
)
elif model_type == "mllama":
assert processor is not None and isinstance(processor, MllamaImageProcessor), (
"The MllamaImageProcessor must be set."
)
@@ -164,6 +189,12 @@ def auto_quantize(
):
"""Auto search quantization of multiple formats."""

if getattr(args, "calib_with_images", False):
raise NotImplementedError(
"AutoQuantize with image-text calibration is not supported yet. "
"Please run plain PTQ (e.g., --qformat nvfp4) with --calib_with_images."
)

assert not (args.auto_quantize_bits and args.inference_pipeline_parallel > 1), (
"Auto Quantization is not supported for pipeline parallel size > 1"
)
@@ -291,7 +322,9 @@ def load_model(args: argparse.Namespace):
tokenizer = None
language_model = full_model
default_padding_side = None
default_pad_token = None

is_nemotron_vl_model = is_nemotron_vl(full_model)
if model_type == "mllama":
processor = get_processor(
args.pyt_ckpt_path,
@@ -307,6 +340,47 @@
device,
trust_remote_code=args.trust_remote_code,
)
elif is_nemotron_vl_model and getattr(args, "calib_with_images", False):
# For Nemotron VL image calibration, we need an AutoProcessor to build multimodal inputs.
try:
processor = AutoProcessor.from_pretrained(
args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code, padding_side="left"
)
except Exception as e:
raise RuntimeError(
"Failed to load AutoProcessor for Nemotron VL image calibration. "
"Please ensure the checkpoint provides a compatible processor."
) from e

if hasattr(processor, "tokenizer") and processor.tokenizer is not None:
tokenizer = processor.tokenizer
else:
tokenizer = get_tokenizer(args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code)

default_pad_token = tokenizer.pad_token
# Some Nemotron tokenizers may not define pad_token by default, but padding=True during calibration requires one, so fall back to eos_token.
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
assert tokenizer.pad_token is not None, f"Pad token for {args.pyt_ckpt_path} cannot be set!"

default_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"

# Quantize only the language model, but keep the full_model for calibration forward.
language_model_lineage = get_language_model_from_vl(full_model)
if language_model_lineage is not None:
language_model = language_model_lineage.pop(-1)
ancestors = language_model_lineage
disabled_quant_cfg = {"quant_cfg": {"default": {"enable": False}}, "algorithm": "max"}

memo = set(ancestors) | {language_model}
for ancestor in ancestors:
for _, module in ancestor.named_children():
if module not in memo:
mtq.quantize(module, disabled_quant_cfg, forward_loop=None)
memo.add(module)

model_type = get_model_type(language_model)
else:
if args.dataset is None:
args.dataset = ["cnn_dailymail", "nemotron-post-training-dataset-v2"]
Expand All @@ -320,6 +394,7 @@ def load_model(args: argparse.Namespace):
tokenizer = get_tokenizer(args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code)

default_padding_side = tokenizer.padding_side
default_pad_token = tokenizer.pad_token
# Left padding usually provides better calibration result.
tokenizer.padding_side = "left"

Expand Down Expand Up @@ -355,6 +430,7 @@ def load_model(args: argparse.Namespace):
processor,
tokenizer,
default_padding_side,
default_pad_token,
device,
)

@@ -432,9 +508,33 @@ def mono_quantize(

if not use_calibration:
warnings.warn("Dynamic quantization. Calibration skipped.")
calibrate_loop = (
create_forward_loop(dataloader=calib_dataloader) if use_calibration else None
)
calibrate_loop = None
if use_calibration:
base_forward_loop = create_forward_loop(dataloader=calib_dataloader)
# For Nemotron VL image calibration, the dataloader yields multimodal kwargs (e.g., pixel_values).
# Those kwargs must be consumed by the *full* VLM model, not the extracted language_model.
if getattr(args, "calib_with_images", False) and is_nemotron_vl_model:

def calibrate_full_model(_model):
forward_params = inspect.signature(full_model.forward).parameters
accepts_kwargs = any(
p.kind == inspect.Parameter.VAR_KEYWORD for p in forward_params.values()
)
allowed_keys = set(forward_params.keys())

full_model.eval()
with torch.no_grad():
for batch in calib_dataloader:
if accepts_kwargs:
call_kwargs = batch
else:
call_kwargs = {k: v for k, v in batch.items() if k in allowed_keys}
call_kwargs = {k: v for k, v in call_kwargs.items() if v is not None}
safe_nemotron_vl_forward(full_model, call_kwargs)

calibrate_loop = calibrate_full_model
else:
calibrate_loop = base_forward_loop

if calibration_only:
language_model = mtq.calibrate(
@@ -461,6 +561,7 @@ def export_quantized(
model_type: str | None,
tokenizer: PreTrainedTokenizerBase | None,
default_padding_side,
default_pad_token,
):
with torch.inference_mode():
if model_type is None:
@@ -546,6 +647,8 @@ def export_quantized(
# Restore default padding and export the tokenizer as well.
if tokenizer is not None:
tokenizer.padding_side = default_padding_side
if default_pad_token is not None:
tokenizer.pad_token = default_pad_token
tokenizer.save_pretrained(export_path)

end_time = time.time()
@@ -690,6 +793,7 @@ def quantize_main(
processor: BaseImageProcessor | ProcessorMixin | None,
tokenizer: PreTrainedTokenizerBase | None,
default_padding_side,
default_pad_token,
device: torch.device,
):
if args.batch_size == 0:
@@ -805,7 +909,15 @@
is_nemotron_vl_model,
first_text_speech_dataset,
)
export_quantized(args, full_model, language_model, model_type, tokenizer, default_padding_side)
export_quantized(
args,
full_model,
language_model,
model_type,
tokenizer,
default_padding_side,
default_pad_token,
)


def parse_args() -> argparse.Namespace:
@@ -856,6 +968,14 @@ def parse_args() -> argparse.Namespace:
type=str,
default=None,
)
parser.add_argument(
"--calib_with_images",
action="store_true",
help=(
"Calibrate with image-text pairs (for VLMs). "
"This uses nemotron_vlm_dataset_v2 with default subsets (sparsetables, plotqa_cot, wiki_en)."
),
)
parser.add_argument("--inference_tensor_parallel", type=int, default=1)
parser.add_argument("--inference_pipeline_parallel", type=int, default=1)
parser.add_argument("--awq_block_size", default=0, type=int)
@@ -993,6 +1113,7 @@ def main(args: argparse.Namespace):
processor,
tokenizer,
default_padding_side,
default_pad_token,
device,
) = load_model(args)

@@ -1010,6 +1131,7 @@
processor,
tokenizer,
default_padding_side,
default_pad_token,
device,
)

@@ -1020,6 +1142,6 @@
if args.export_fmt != "hf":
warnings.warn("Deprecated. --export_fmt forced to hf.")

args.dataset = args.dataset.split(",") if args.dataset else None
args.dataset = args.dataset.split(",") if isinstance(args.dataset, str) else args.dataset
args.calib_size = [int(num_sample) for num_sample in args.calib_size.split(",")]
main(args)
98 changes: 98 additions & 0 deletions examples/llm_ptq/nemotron_vl_calib.py
@@ -0,0 +1,98 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Nemotron VL calibration helpers.

The Nemotron Nano VL v2 remote-code wrapper's `forward()` is not ideal to call during PTQ calibration because it may:
- Call `torch.distributed.get_rank()` unconditionally
- Assume `past_key_values` exists in the language model output

Instead, we run a "safe multimodal forward" that exercises:
- Vision encoder feature extraction (C-RADIOv2-H)
- Insertion of vision embeddings into token embeddings at `img_context_token_id`
- Language model forward pass (to trigger quantizer calibration)
"""

from __future__ import annotations

import contextlib
from typing import Any

import torch


def safe_nemotron_vl_forward(full_model: torch.nn.Module, batch: dict[str, Any]) -> None:
"""Run a minimal multimodal forward for Nemotron VL that avoids wrapper output packaging."""
pixel_values = batch.get("pixel_values")
input_ids = batch.get("input_ids")
attention_mask = batch.get("attention_mask")
position_ids = batch.get("position_ids")
image_flags = batch.get("image_flags")

if pixel_values is None or input_ids is None:
return

# Nemotron Nano VL v2 expects `image_flags` in forward(), but the processor doesn't always emit it.
# `pixel_values` is flattened across batch*images, so `image_flags` should align with pixel_values.shape[0].
if image_flags is None and torch.is_tensor(pixel_values):
image_flags = torch.ones(
(pixel_values.shape[0], 1), device=pixel_values.device, dtype=torch.long
)
if image_flags is None:
return

# Match the model's preferred vision dtype (usually bf16).
vision_dtype = None
with contextlib.suppress(Exception):
vision_dtype = getattr(full_model.vision_model.config, "torch_dtype", None)
if vision_dtype is None:
with contextlib.suppress(Exception):
vision_dtype = getattr(full_model.language_model.config, "torch_dtype", None)
if (
vision_dtype is not None
and torch.is_tensor(pixel_values)
and pixel_values.dtype != vision_dtype
):
pixel_values = pixel_values.to(dtype=vision_dtype)

# Token embeddings
inputs_embeds = full_model.language_model.get_input_embeddings()(input_ids)
image_flags_s = image_flags.squeeze(-1)

b, n, c = inputs_embeds.shape
flat_embeds = inputs_embeds.reshape(b * n, c)
flat_ids = input_ids.reshape(b * n)
selected = flat_ids == full_model.img_context_token_id

# Vision embeddings
vit_embeds = full_model.extract_feature(pixel_values)
vit_embeds = vit_embeds[image_flags_s == 1]
try:
flat_embeds[selected] = flat_embeds[selected] * 0.0 + vit_embeds.reshape(-1, c)
except Exception:
vit_embeds = vit_embeds.reshape(-1, c)
n_token = selected.sum()
flat_embeds[selected] = flat_embeds[selected] * 0.0 + vit_embeds[:n_token]

inputs_embeds = flat_embeds.reshape(b, n, c)

# LLM forward (drives activation stats)
full_model.language_model(
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
position_ids=position_ids,
use_cache=False,
return_dict=False,
)