Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
1fee97c
add rabbit feedback
Fridah-nv Feb 6, 2026
3f717dd
minor
Fridah-nv Feb 13, 2026
971b168
tested perplexity
sugunav14 Feb 4, 2026
10c16ca
tested, revert later
sugunav14 Feb 9, 2026
364fd78
tested
sugunav14 Feb 10, 2026
5aee517
refactor
sugunav14 Feb 11, 2026
4b1e42f
Track global_amax for weight FP4 MSE sweep; Refactor to NVFP4StaticQa…
realAsma Feb 6, 2026
6a15d0d
address reviewers feedback, delegate scaling factor calculation to NV…
Fridah-nv Feb 6, 2026
7b7146b
tested perplexity
sugunav14 Feb 4, 2026
40c14ef
tested exported checkpoints on 0211
sugunav14 Feb 12, 2026
7a1e006
tested nano v3
sugunav14 Feb 13, 2026
e6df379
added activation MSE logging
sugunav14 Feb 16, 2026
b81fed8
super v3 run
sugunav14 Feb 17, 2026
f3a9524
added activation MSE logging helper
sugunav14 Feb 17, 2026
22e2b95
input amax sync added + tested gptq super sft checkpoint
sugunav14 Feb 19, 2026
10d21ba
checkpoints generated on 0223
sugunav14 Feb 23, 2026
188fa1d
tested perplexity
sugunav14 Feb 4, 2026
599227e
tested, revert later
sugunav14 Feb 9, 2026
60df0d8
tested
sugunav14 Feb 10, 2026
f88ba6e
initial cleanup
sugunav14 Feb 24, 2026
7b24cd3
cleanup
sugunav14 Feb 24, 2026
b17b917
removed stray prints
sugunav14 Feb 24, 2026
8ff8976
fix rebase issues
sugunav14 Mar 6, 2026
5815ce8
minor
sugunav14 Mar 6, 2026
b1f1434
tested e2e on qwen
sugunav14 Mar 6, 2026
df6b182
removed perplexity eval
sugunav14 Mar 6, 2026
75a08fe
update
sugunav14 Mar 6, 2026
9e58a6f
revert later
sugunav14 Mar 16, 2026
16086c7
minor update
sugunav14 Mar 19, 2026
9b47e77
update
sugunav14 Mar 18, 2026
4ec2433
gptq faster
sugunav14 Mar 18, 2026
2b0af3d
added metrics files, remove later
sugunav14 Mar 20, 2026
ee40b48
claude review
sugunav14 Mar 21, 2026
a175178
remove stray files
sugunav14 Mar 21, 2026
a948497
refactor
sugunav14 Mar 22, 2026
7e235b4
claude review + coderabbit review
sugunav14 Mar 23, 2026
d1498be
refactor
sugunav14 Mar 23, 2026
d8b1d93
stray changes removed
sugunav14 Mar 23, 2026
19fc0c2
Address PR comments
sugunav14 Mar 25, 2026
068e8a9
fixed circular import issue
sugunav14 Mar 25, 2026
2930b55
tested e2e on qwen3-8b
sugunav14 Mar 31, 2026
b35bc85
tested e2e on qwen3-8b
sugunav14 Mar 31, 2026
0f621cd
latest run with export
sugunav14 Apr 6, 2026
af59a55
clean up
sugunav14 Apr 6, 2026
d52d614
update
sugunav14 Apr 6, 2026
c705c24
updated e2e test
sugunav14 Apr 6, 2026
6d8f9b3
new perplexity eval
sugunav14 Apr 7, 2026
224f77d
update
sugunav14 Apr 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
278 changes: 257 additions & 21 deletions examples/llm_ptq/hf_ptq.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,13 @@
save_expert_token_count_table,
)
from modelopt.torch.export.model_utils import get_language_model_from_vl, is_multimodal_model
from modelopt.torch.export.plugins.vllm_fakequant_hf import export_hf_vllm_fq_checkpoint
from modelopt.torch.quantization.config import _default_disabled_quantizer_cfg, need_calibration
from modelopt.torch.quantization.metrics_backup import (
ActivationMSELogger,
compute_perplexity,
get_wikitext2,
)
from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
from modelopt.torch.quantization.utils import is_quantized
from modelopt.torch.utils.dataset_utils import (
Expand Down Expand Up @@ -241,6 +247,78 @@ def make_calib_dataloader(
return calib_dataloader, first_text_speech_dataset


def _make_mse_holdout_dataloader(args, tokenizer, device):
    """Create a hold-out dataloader for activation MSE, excluding calibration samples.

    Uses content-based exclusion: reconstructs the exact calibration texts and
    filters them from the hold-out set. This is robust to empty/filtered samples
    that cause pure index-based skipping to under-count and produce overlap.

    Args:
        args: Parsed CLI namespace; reads ``dataset`` (list of dataset names),
            ``calib_size`` (list of per-dataset calibration sample counts),
            ``activation_mse_max_samples`` and ``batch_size``.
        tokenizer: Tokenizer used both to reconstruct the calibration texts and
            to tokenize the hold-out texts (deep-copied before use so padding
            state never leaks into the shared instance).
        device: Optional device the tokenized batch is moved to; left on CPU
            when falsy.

    Returns:
        A ``DataLoader`` over ``{"input_ids": Tensor}`` dicts, unshuffled.
    """
    from modelopt.torch.utils.dataset_utils import SUPPORTED_DATASET_CONFIG, get_dataset_samples

    dataset_names, calib_sizes = args.dataset, args.calib_size
    n_mse = args.activation_mse_max_samples

    # 1. Reconstruct the exact calibration texts (same args as make_calib_dataloader).
    calib_texts: set[str] = set()
    for ds_name, cs in zip(dataset_names, calib_sizes):
        calib_texts.update(get_dataset_samples(ds_name, cs, tokenizer=tokenizer))

    # 2. Per-split skip for efficiency (avoids re-iterating calibration range).
    #    -(-cs // n) is ceil(cs / n) without importing math.
    skip_per_dataset = []
    for ds_name, cs in zip(dataset_names, calib_sizes):
        n_splits = len(
            SUPPORTED_DATASET_CONFIG.get(ds_name, {}).get("config", {}).get("split", [None])
        )
        skip_per_dataset.append(-(-cs // max(n_splits, 1)))
    skip = max(skip_per_dataset)

    # 3. Distribute hold-out samples proportionally across datasets.
    #    The remainder for the last dataset is clamped to >= 1: when the
    #    per-dataset max(1, ...) floors already consume the whole n_mse budget
    #    (many datasets, small n_mse), the unclamped remainder could be zero or
    #    negative, and a negative size would make the [:hs] slice below truncate
    #    from the wrong end and silently drop samples for the last dataset.
    total_calib = sum(calib_sizes)
    holdout_sizes = [max(1, int(n_mse * cs / total_calib)) for cs in calib_sizes]
    holdout_sizes[-1] = max(1, n_mse - sum(holdout_sizes[:-1]))

    # 4. Collect hold-out texts, requesting extras to replace any filtered overlaps.
    all_holdout_texts: list[str] = []
    for ds_name, hs in zip(dataset_names, holdout_sizes):
        texts = get_dataset_samples(
            ds_name,
            hs + len(calib_texts),
            tokenizer=tokenizer,
            skip_samples=skip,
        )
        filtered = [t for t in texts if t not in calib_texts][:hs]
        all_holdout_texts.extend(filtered)

    # 5. Tokenize and build dataloader (mirrors get_dataset_dataloader logic).
    #    Deep-copy so any padding-side/pad-token mutation by __call__ cannot
    #    affect the caller's tokenizer.
    tok = copy.deepcopy(tokenizer)
    batch_encoded = tok(
        all_holdout_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    if device:
        batch_encoded = batch_encoded.to(device)

    class _InputIdsDataset(torch.utils.data.Dataset):
        # Minimal dataset wrapping a pre-tokenized [num_samples, seq_len] tensor.
        def __init__(self, input_ids):
            self.input_ids = input_ids

        def __getitem__(self, idx):
            return {"input_ids": self.input_ids[idx]}

        def __len__(self):
            return len(self.input_ids)

    return DataLoader(
        _InputIdsDataset(batch_encoded["input_ids"]),
        batch_size=args.batch_size,
        shuffle=False,
    )


def auto_quantize(
args: argparse.Namespace,
language_model: torch.nn.Module,
Expand Down Expand Up @@ -686,11 +764,17 @@ def export_quantized(
if mtp_layer_prefixes:
full_model._mtp_layer_prefixes = mtp_layer_prefixes

export_hf_checkpoint(
full_model,
export_dir=export_path,
extra_state_dict=mtp_state_dict,
)
if args.vllm_fakequant_export:
export_hf_vllm_fq_checkpoint(
full_model,
export_dir=export_path,
)
else:
export_hf_checkpoint(
full_model,
export_dir=export_path,
extra_state_dict=mtp_state_dict,
)

# Restore default padding and export the tokenizer as well.
if tokenizer is not None:
Expand Down Expand Up @@ -747,7 +831,7 @@ def pre_quantize(
allow_fallback=False,
)
else:
generated_ids_before_ptq = full_model.generate(preview_input_ids, max_new_tokens=100)
generated_ids_before_ptq = full_model.generate(preview_input_ids, max_new_tokens=2)

return preview_input_ids, generated_ids_before_ptq

Expand Down Expand Up @@ -786,7 +870,7 @@ def post_quantize(
pass
elif model_type != "llama4" and not is_nemotron_vl_model:
# Our fake quantizer may not be fully compatible with torch.compile.
generated_ids_after_ptq = full_model.generate(preview_input_ids, max_new_tokens=100)
generated_ids_after_ptq = full_model.generate(preview_input_ids, max_new_tokens=2)
elif is_nemotron_vl_model and tokenizer is not None:
generated_ids_after_ptq = run_nemotron_vl_preview(
full_model,
Expand Down Expand Up @@ -910,6 +994,9 @@ def quantize_main(
args, full_model, model_type, tokenizer, calib_dataloader, is_nemotron_vl_model
)

mse_logger = None
mse_data = None

if args.auto_quantize_bits:
assert len(args.qformat.split(",")) > 1, (
"Auto quantization needs multiple quantization format."
Expand Down Expand Up @@ -937,10 +1024,17 @@ def quantize_main(
"Plain quantization supports only one quantization format."
)

assert args.qformat in QUANT_CFG_CHOICES, (
f"Unsupported quantization format: {args.qformat}, choices are: {list(QUANT_CFG_CHOICES.keys())}"
)
quant_cfg = QUANT_CFG_CHOICES[args.qformat]
if args.qformat in QUANT_CFG_CHOICES:
quant_cfg = QUANT_CFG_CHOICES[args.qformat]
else:
# Fallback: resolve dynamically registered configs from the mtq namespace
# (e.g., PSX LUTS configs registered by modelopt-internal plugins).
quant_cfg = getattr(mtq, args.qformat, None)
assert quant_cfg is not None, (
f"Unsupported quantization format: {args.qformat}, "
f"not found in built-in choices {list(QUANT_CFG_CHOICES.keys())} "
f"or in the mtq namespace (check that the required plugin is installed)."
)

quant_cfg = build_quant_cfg(
args.qformat,
Expand Down Expand Up @@ -976,7 +1070,23 @@ def quantize_main(
quant_cfg = copy.deepcopy(quant_cfg)
_set_kv_cache_constant_amax(quant_cfg["quant_cfg"])

if args.qformat in QUANT_CFG_CHOICES:
# Collect original (unquantized) activations before quantization modifies the model
if args.measure_activation_mse:
mse_logger = ActivationMSELogger(
max_samples=args.activation_mse_max_samples,
save_dir=args.activation_mse_save_dir,
)
mse_data = ActivationMSELogger.resolve_data(
input_path=args.activation_mse_input_path,
calib_dataloader=calib_dataloader,
tokenizer=tokenizer,
max_samples=args.activation_mse_max_samples,
max_length=args.calib_seq,
make_holdout_fn=lambda: _make_mse_holdout_dataloader(args, tokenizer, device),
)
mse_logger.collect(language_model, mse_data, phase="original")

if args.qformat in QUANT_CFG_CHOICES or hasattr(mtq, args.qformat):
mono_quantize(
args,
quant_cfg,
Expand All @@ -1002,15 +1112,67 @@ def quantize_main(
is_nemotron_vl_model,
first_text_speech_dataset,
)
export_quantized(
args,
full_model,
language_model,
model_type,
tokenizer,
default_padding_side,
default_pad_token,
)

if mse_logger is not None:
mse_logger.finish(language_model, mse_data)
del mse_logger, mse_data
torch.cuda.empty_cache()

if args.eval_perplexity and tokenizer is not None:
if args.fold_weights:
print("Folding weights before perplexity evaluation...")
mtq.fold_weight(language_model)
if args.eval_perplexity_input_path:
print(f"Loading perplexity eval data from {args.eval_perplexity_input_path}")
eval_data = torch.load(
args.eval_perplexity_input_path, map_location="cpu", weights_only=True
)
# Unbatch to [1, seq_len] per element for compute_perplexity batching
unbatched = []
for t in eval_data:
if not isinstance(t, torch.Tensor):
continue
if t.dim() == 1:
unbatched.append(t.unsqueeze(0))
elif t.shape[0] > 1:
unbatched.extend(t.unbind(0))
else:
unbatched.append(t)
eval_data = [t.unsqueeze(0) if t.dim() == 1 else t for t in unbatched]
print(f"Loaded {len(eval_data)} sequences from {args.eval_perplexity_input_path}")
label = args.eval_perplexity_input_path
else:
eval_data = get_wikitext2(tokenizer, args.eval_perplexity_seq_len)
label = "Wikitext-2"
ppl = compute_perplexity(full_model, eval_data)
print(f"{label} perplexity: {ppl:.2f}")

# Plugin-registered configs (e.g. PSX LUTS from modelopt-internal) are not exportable
# via the standard TRT-LLM / HF export paths. Fall back to save_pretrained().
if args.qformat not in QUANT_CFG_CHOICES and hasattr(mtq, args.qformat):
export_path = args.export_path
if args.vllm_fakequant_export:
print(f"Exporting vLLM fakequant checkpoint (bf16 weights + amax) to: {export_path}")
export_hf_vllm_fq_checkpoint(full_model, export_dir=export_path)
else:
print(
f"qformat '{args.qformat}' is a plugin-registered config and is not exportable "
f"via the standard export pipeline. Saving with save_pretrained() instead."
)
full_model.save_pretrained(export_path)
if tokenizer is not None:
tokenizer.save_pretrained(export_path)
Comment on lines +1150 to +1164
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

This plugin-config export bypass skips the normal artifact cleanup.

By bypassing export_quantized(), this branch never restores the tokenizer's original padding settings and never copies custom model files/configs. That makes the saved artifact materially different from the standard export path, especially for trust_remote_code models.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@examples/llm_ptq/hf_ptq.py` around lines 1088 - 1102, The plugin-config
export branch that uses full_model.save_pretrained() /
export_hf_vllm_fq_checkpoint (when args.qformat not in QUANT_CFG_CHOICES and
hasattr(mtq, args.qformat)) skips the post-export cleanup done by
export_quantized(): restore the tokenizer's original padding/token settings and
copy any custom model files/configs required for trust_remote_code models. Fix
by invoking the same post-export cleanup steps used by export_quantized() after
saving (or by calling a shared helper): restore the tokenizer's original
padding/pad_token/padding_side state and then copy over any custom/model config
files into export_path so the artifact matches the standard export path; ensure
this runs for both the export_hf_vllm_fq_checkpoint and
full_model.save_pretrained branches before tokenizer.save_pretrained.

print(f"Quantized model saved to: {export_path}")
else:
export_quantized(
args,
full_model,
language_model,
model_type,
tokenizer,
default_padding_side,
default_pad_token,
)


def parse_args() -> argparse.Namespace:
Expand Down Expand Up @@ -1064,6 +1226,16 @@ def parse_args() -> argparse.Namespace:
default=512,
)
parser.add_argument("--export_path", default="exported_model")
parser.add_argument(
"--vllm_fakequant_export",
action="store_true",
default=False,
help=(
"Export bf16 weights and amax values separately for vLLM fakequant serving. "
"Produces a standard HF checkpoint with GPTQ-adjusted weights plus a "
"quant_amax.pth file that can be loaded via AMAX_FILE_PATH in vllm_serve_fakequant.py."
),
)
parser.add_argument(
"--dataset",
help=(
Expand Down Expand Up @@ -1219,6 +1391,70 @@ def parse_args() -> argparse.Namespace:
),
)

parser.add_argument(
"--eval_perplexity",
action=argparse.BooleanOptionalAction,
default=False,
help="Evaluate Wikitext-2 perplexity after quantization (before export).",
)
parser.add_argument(
"--eval_perplexity_seq_len",
type=int,
default=2048,
help="Sequence length for perplexity evaluation (default: 2048).",
)
parser.add_argument(
"--eval_perplexity_input_path",
type=str,
default=None,
help=(
"Path to a .pt file containing pre-tokenized evaluation data "
"(List[Tensor], each [1, seq_len]) for perplexity evaluation. "
"When set, this data is used instead of WikiText-2. "
"Compatible with the .pt files produced by create_holdout_mse_inputs.py "
"or --activation_mse_input_path."
),
)
parser.add_argument(
"--measure_activation_mse",
action=argparse.BooleanOptionalAction,
default=False,
help="Measure per-layer activation MSE (original vs quantized) after quantization.",
)
parser.add_argument(
"--activation_mse_max_samples",
type=int,
default=16,
help="Max calibration samples for activation MSE (default: 16).",
)
parser.add_argument(
"--activation_mse_save_dir",
type=str,
default=None,
help="Directory to save activation MSE results. If not set, results are only printed.",
)
parser.add_argument(
"--activation_mse_input_path",
type=str,
default=None,
help=(
"Path to frozen MSE input data. Supports two formats:\n"
" .json — raw text (cross-model reuse): if file exists, loads and re-tokenizes "
"with the current model's tokenizer; if not, decodes calibration data to text and saves.\n"
" .pt — tokenized tensors (same-tokenizer reuse): if file exists, loads directly; "
"if not, materializes from calibration data and saves."
),
)
parser.add_argument(
"--fold_weights",
action=argparse.BooleanOptionalAction,
default=False,
help=(
"Fold quantized weights before collecting activation MSE. "
"Speeds up the quantized forward pass by replacing weights in-place "
"and disabling fake-quant, but permanently mutates the weights."
),
)
args = parser.parse_args()
if args.moe_calib_experts_ratio is not None and not (0.0 < args.moe_calib_experts_ratio <= 1.0):
parser.error("--moe_calib_experts_ratio must be in the range (0.0, 1.0].")
Expand Down
10 changes: 9 additions & 1 deletion examples/vllm_serve/fakequant_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ def disable_compilation(model):
"quant_cfg": os.environ.get("QUANT_CFG", None),
"kv_quant_cfg": os.environ.get("KV_QUANT_CFG", None),
"amax_file_path": os.environ.get("AMAX_FILE_PATH", None),
"skip_fold_weight": os.environ.get("SKIP_FOLD_WEIGHT", "0") == "1",
}


Expand Down Expand Up @@ -329,7 +330,14 @@ def calibrate_loop(model: Any = None) -> None:
if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
mtq.print_quant_summary(model)

mtq.fold_weight(model)
if quant_config["skip_fold_weight"]:
print("Skipping fold_weight (weights already quantized, e.g. from GPTQ export)")
for name, module in model.named_modules():
if name.endswith("weight_quantizer"):
module.disable()
else:
mtq.fold_weight(model)

for name, module in model.named_modules():
if name.endswith("weight_quantizer"):
assert not module.is_enabled, f"quantizer {name} is still enabled"
Expand Down
1 change: 1 addition & 0 deletions examples/vllm_serve/vllm_serve_fakequant.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
"QUANT_CFG",
"AMAX_FILE_PATH",
"KV_QUANT_CFG",
"SKIP_FOLD_WEIGHT",
}

RayDistributedExecutor.ADDITIONAL_ENV_VARS.update(additional_env_vars)
Expand Down
Loading