diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 12118e20b..15376996b 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -17,7 +17,6 @@ modelopt/deploy @NVIDIA/modelopt-deploy-codeowners modelopt/onnx @NVIDIA/modelopt-onnx-codeowners modelopt/onnx/autocast @NVIDIA/modelopt-onnx-autocast-codeowners modelopt/torch @NVIDIA/modelopt-torch-codeowners -modelopt/torch/_compress @NVIDIA/modelopt-torch-compress-codeowners modelopt/torch/_deploy @NVIDIA/modelopt-torch-deploy-codeowners modelopt/torch/distill @NVIDIA/modelopt-torch-distill-codeowners modelopt/torch/export @NVIDIA/modelopt-torch-export-codeowners @@ -25,6 +24,7 @@ modelopt/torch/nas @NVIDIA/modelopt-torch-nas-prune-codeowners modelopt/torch/opt @NVIDIA/modelopt-torch-opt-codeowners modelopt/torch/peft @NVIDIA/modelopt-torch-peft-codeowners modelopt/torch/prune @NVIDIA/modelopt-torch-nas-prune-codeowners +modelopt/torch/puzzletron @NVIDIA/modelopt-torch-puzzletron-codeowners modelopt/torch/quantization @NVIDIA/modelopt-torch-quantization-codeowners modelopt/torch/sparsity @NVIDIA/modelopt-torch-sparsity-codeowners modelopt/torch/speculative @NVIDIA/modelopt-torch-speculative-codeowners diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 70bae3609..c1895d943 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,17 +24,17 @@ repos: hooks: - id: ruff-check args: [--fix, --exit-non-zero-on-fix] - # See: commit hooks modifies block_config.py leading to test_compress.py failing (#25) · Issues · omniml / modelopt · GitLab + # See: commit hooks modifies block_config.py leading to test_puzzletron.py failing (#25) · Issues · omniml / modelopt · GitLab exclude: > (?x)^( - modelopt/torch/_compress/decilm/deci_lm_hf_code/block_config\.py| - modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_.*\.py + modelopt/torch/puzzletron/decilm/deci_lm_hf_code/block_config\.py| + modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_.*\.py )$ - id: ruff-format exclude: > (?x)^( - modelopt/torch/_compress/decilm/deci_lm_hf_code/block_config\.py| - modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_.*\.py + modelopt/torch/puzzletron/decilm/deci_lm_hf_code/block_config\.py| + modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_.*\.py )$ - repo: https://github.com/pre-commit/mirrors-mypy @@ -107,7 +107,7 @@ repos: examples/speculative_decoding/main.py| examples/speculative_decoding/medusa_utils.py| examples/speculative_decoding/server_generate.py| - modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_.*\.py| + modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_.*\.py| )$ # Default hook for Apache 2.0 in c/c++/cuda files diff --git a/examples/pruning/README.md b/examples/pruning/README.md index 2c2a7c785..9792f2932 100644 --- a/examples/pruning/README.md +++ b/examples/pruning/README.md @@ -7,6 +7,7 @@ Pruning can involve removal (prune) of Linear and Conv layers; and Transformer a This section focuses on applying Model Optimizer's state-of-the-art complementary pruning modes to enable you to search for the best subnet architecture from your provided base model: 1. [Minitron](https://arxiv.org/pdf/2408.11796): A pruning method developed by NVIDIA Research for pruning GPT (and later extended to Mamba, MoE, and Hybrid Transformer Mamba) models in NVIDIA Megatron-LM or NeMo framework. 
It uses the activation magnitudes to prune the embedding hidden size; mlp ffn hidden size; transformer attention heads; mamba heads and head dimension; MoE number of experts, ffn hidden size, and shared expert intermediate size; and number of layers of the model. +1. [Puzzletron](../puzzletron/README.md): An advanced pruning method by NVIDIA that uses a Mixed Integer Programming (MIP) based NAS search algorithm. 1. FastNAS: A pruning method recommended for Computer Vision models. Given a pretrained model, FastNAS finds the subnet which maximizes the score function while meeting the given constraints. 1. GradNAS: A light-weight pruning method recommended for language models like Hugging Face BERT, GPT-J. It uses the gradient information to prune the model's linear layers and attention heads to meet the given constraints. @@ -23,8 +24,6 @@ This section focuses on applying Model Optimizer's state-of-the-art complementar -For more advanced pruning strategies, such as the [Puzzle methodology](https://arxiv.org/pdf/2411.19146), please see [Puzzle pruning example](../compress/README.md). - ## Pre-Requisites For Minitron pruning for Megatron-LM / NeMo models, use the NeMo container (e.g., `nvcr.io/nvidia/nemo:25.09`) which has all the dependencies installed. diff --git a/examples/compress/README.md b/examples/puzzletron/README.md similarity index 86% rename from examples/compress/README.md rename to examples/puzzletron/README.md index 42e55892e..e3a909d22 100644 --- a/examples/compress/README.md +++ b/examples/puzzletron/README.md @@ -1,6 +1,6 @@ -# Compress Algorithm Tutorial +# Puzzletron Algorithm Tutorial -This tutorial demonstrates how to compress large language models using the compress algorithm based on the [Puzzle paper](https://arxiv.org/abs/2411.19146). +This tutorial demonstrates how to compress large language models using the puzzletron algorithm based on the [Puzzle paper](https://arxiv.org/abs/2411.19146). The goal of the algorithm is to find the optimal modifications to MLP and attention layers of the model, resulting in a heterogeneous model architecture. The supported modifications are: @@ -16,7 +16,7 @@ In this example, we compress the [Llama-3.1-8B-Instruct](https://huggingface.co/ - Install Model-Optimizer in editable mode with the corresponding dependencies: ```bash -pip install -e .[hf,compress] +pip install -e .[hf,puzzletron] ``` - For this example we are using 2x NVIDIA H100 80GB HBM3 to show multi-GPU steps. You can also use a single GPU. @@ -34,7 +34,7 @@ hf auth login dataset split: "code", "math", "stem", "chat", excluding reasoning samples (2.62GB) ```bash - python -m modelopt.torch._compress.dataset.prepare_dataset --dataset_name nvidia/Nemotron-Post-Training-Dataset-v2 --output_dir path/to/Nemotron-Post-Training-Dataset-v2 + python -m modelopt.torch.puzzletron.dataset.prepare_dataset --dataset_name nvidia/Nemotron-Post-Training-Dataset-v2 --output_dir path/to/Nemotron-Post-Training-Dataset-v2 ``` 2. Specify the `puzzle_dir`, `input_hf_model_path`, `dataset_path`, `intermediate_size_list`, and `target_memory` arguments in the [llama-3_1-8B_pruneffn_memory.yaml](./configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml) configuration file. @@ -51,23 +51,23 @@ hf auth login We can also set the target size of the resulting model using `num_params = 7_000_000_000`. This will be used as an upper bound for the number of parameters of the model. -3. Run the compression script. +3. Run the puzzletron pipeline.
```bash - torchrun --nproc_per_node 2 examples/compress/main.py --config examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml 2>&1 | tee ./log.txt | grep "Compress Progress" + torchrun --nproc_per_node 2 examples/puzzletron/main.py --config examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml 2>&1 | tee ./log.txt | grep "Puzzletron Progress" ``` This will save the full output to `log.txt` and display the following progress on screen: ```bash - [2025-11-02 12:06:34][rank-0][main.py:71] Compress Progress 1/8: starting compression pipeline - [2025-11-02 12:06:45][rank-0][compress_nas_plugin.py:123] Compress Progress 2/8: converting model from HF to DeciLM (single-gpu) - [2025-11-02 12:07:07][rank-0][compress_nas_plugin.py:132] Compress Progress 3/8: scoring pruning activations (multi-gpu) - [2025-11-02 12:11:36][rank-0][compress_nas_plugin.py:137] Compress Progress 4/8: pruning the model and saving pruned checkpoints (single-gpu) - [2025-11-02 12:12:20][rank-0][compress_nas_plugin.py:217] Compress Progress 5/8: building replacement library and subblock statistics (single-gpu) - [2025-11-02 12:12:21][rank-0][compress_nas_plugin.py:222] Compress Progress 6/8: calculating one block scores (multi-gpu) - [2025-11-02 12:50:41][rank-0][compress_nas_plugin.py:226] Compress Progress 7/8: running MIP and realizing models (multi-gpu) - [2025-11-02 12:52:34][rank-0][main.py:115] Compress Progress 8/8: compression pipeline completed (multi-gpu) + [2025-11-02 12:06:34][rank-0][main.py:71] Puzzletron Progress 1/8: starting puzzletron pipeline + [2025-11-02 12:06:45][rank-0][puzzletron_nas_plugin.py:123] Puzzletron Progress 2/8: converting model from HF to DeciLM (single-gpu) + [2025-11-02 12:07:07][rank-0][puzzletron_nas_plugin.py:132] Puzzletron Progress 3/8: scoring pruning activations (multi-gpu) + [2025-11-02 12:11:36][rank-0][puzzletron_nas_plugin.py:137] Puzzletron Progress 4/8: pruning the model and saving pruned checkpoints (single-gpu) + [2025-11-02 12:12:20][rank-0][puzzletron_nas_plugin.py:217] Puzzletron Progress 5/8: building replacement library and subblock statistics (single-gpu) + [2025-11-02 12:12:21][rank-0][puzzletron_nas_plugin.py:222] Puzzletron Progress 6/8: calculating one block scores (multi-gpu) + [2025-11-02 12:50:41][rank-0][puzzletron_nas_plugin.py:226] Puzzletron Progress 7/8: running MIP and realizing models (multi-gpu) + [2025-11-02 12:52:34][rank-0][main.py:115] Puzzletron Progress 8/8: puzzletron pipeline completed (multi-gpu) ``` Once the process is complete, the resulting network architecture will be recorded in `log.txt` for your review: @@ -132,7 +132,7 @@ This assumes pruning, replacement library building, NAS scoring, and subblock st For example, let's set `target_memory: 96_000` in `llama-3_1-8B_pruneffn_memory.yaml`. 
```bash -torchrun --nproc_per_node 2 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt | grep "Compress Progress" +torchrun --nproc_per_node 2 examples/puzzletron/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt | grep "Puzzletron Progress" ``` This will generate the following network architecture (see `log.txt`): diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml similarity index 100% rename from examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml similarity index 72% rename from examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml index c9a0cabf3..20eec970e 100644 --- a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml @@ -8,14 +8,14 @@ input_hf_model_path: /workspace/hf_models/meta-llama/Llama-3.1-8B-Instruct # Dataset path for pruning and NAS scoring dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2 -# Working directory for compression outputs +# Working directory for puzzletron outputs puzzle_dir: /workspace/puzzle_dir -# MIP memory constraint (in MiB) +# MIP memory constraint (in MiB) mip: human_constraints: target_memory: 78_000 # 78 GiB # FFN intermediate sizes to search over (heterogeneous architecture) pruning: - intermediate_size_list: [3072, 5888, 8704, 11520] # teacher_intermediate_size is 14336 + intermediate_size_list: [3072, 5888, 8704, 11520] # teacher_intermediate_size is 14336 diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/attn_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/pruning/attn_pruning.yaml similarity index 100% rename from examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/attn_pruning.yaml rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/pruning/attn_pruning.yaml diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/ffn_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/pruning/ffn_pruning.yaml similarity index 100% rename from examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/ffn_pruning.yaml rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/pruning/ffn_pruning.yaml diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/hidden_dim_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/pruning/hidden_dim_pruning.yaml similarity index 100% rename from examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/hidden_dim_pruning.yaml rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/pruning/hidden_dim_pruning.yaml diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/pruning_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/pruning/pruning_defaults.yaml similarity index 100% rename from 
examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/pruning_defaults.yaml rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/pruning/pruning_defaults.yaml diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/validate_model_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/validate_model_defaults.yaml similarity index 80% rename from examples/compress/configs/llama-3_1-8B_pruneffn_memory/validate_model_defaults.yaml rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/validate_model_defaults.yaml index 202af6eb0..ce1749d96 100644 --- a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/validate_model_defaults.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/validate_model_defaults.yaml @@ -14,4 +14,4 @@ write_results: false calc_losses_on_cpu: false activations_log_dir: model_name_or_path: -load_dataset_fn: ${get_object:modelopt.torch._compress.utils.data.dataloaders.load_from_disk_fn} +load_dataset_fn: ${get_object:modelopt.torch.puzzletron.utils.data.dataloaders.load_from_disk_fn} diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/validate_solutions_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/validate_solutions_defaults.yaml similarity index 100% rename from examples/compress/configs/llama-3_1-8B_pruneffn_memory/validate_solutions_defaults.yaml rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/validate_solutions_defaults.yaml diff --git a/examples/compress/main.py b/examples/puzzletron/main.py similarity index 78% rename from examples/compress/main.py rename to examples/puzzletron/main.py index 2c3343c37..16d4de385 100644 --- a/examples/compress/main.py +++ b/examples/puzzletron/main.py @@ -14,14 +14,14 @@ # limitations under the License. """ -Main script for running the compress algorithm on large language models (based on Puzzle paper https://arxiv.org/abs/2411.19146). +Main script for running the puzzletron algorithm on large language models (based on Puzzle paper https://arxiv.org/abs/2411.19146). This script provides two modes: -1. Default mode: Runs the full compression pipeline +1. Default mode: Runs the full puzzletron pipeline 2. 
MIP-only mode: Runs only the MIP search and realize models phase Usage: - # Full compression pipeline + # Full puzzletron pipeline torchrun main.py --config ./configs/llama_3.2_1B_pruneffn_memory.yaml # Only MIP search and realize models phase @@ -32,21 +32,21 @@ from datetime import timedelta from pathlib import Path -import modelopt.torch._compress.mip.mip_and_realize_models as mip_and_realize_models import modelopt.torch.nas as mtn +import modelopt.torch.puzzletron.mip.mip_and_realize_models as mip_and_realize_models import modelopt.torch.utils.distributed as dist -from modelopt.torch._compress.nas.plugins.compress_nas_plugin import CompressModel -from modelopt.torch._compress.tools.hydra_utils import ( +from modelopt.torch.puzzletron.nas.plugins.puzzletron_nas_plugin import PuzzletronModel +from modelopt.torch.puzzletron.tools.hydra_utils import ( initialize_hydra_config_for_dir, register_hydra_resolvers, ) -from modelopt.torch._compress.tools.logger import mprint +from modelopt.torch.puzzletron.tools.logger import mprint def parse_args(): """Parse command line arguments.""" parser = argparse.ArgumentParser( - description="Compress large language models using the Compress algorithm (based on Puzzle paper https://arxiv.org/abs/2411.19146)" + description="Compress large language models using the Puzzletron algorithm (based on Puzzle paper https://arxiv.org/abs/2411.19146)" ) parser.add_argument( "--config", @@ -63,13 +63,13 @@ def parse_args(): return parser.parse_args() -def run_full_compress(hydra_config_path: str): - """Run the full compression pipeline. +def run_full_puzzletron(hydra_config_path: str): + """Run the full puzzletron pipeline. Args: hydra_config_path: Path to the YAML configuration file """ - mprint("Compress Progress 1/8: starting compression pipeline") + mprint("Puzzletron Progress 1/8: starting puzzletron pipeline") dist.setup(timeout=timedelta(10)) # Register Hydra custom resolvers (needed for config resolution) @@ -88,12 +88,12 @@ def run_full_compress(hydra_config_path: str): # Convert model (convert from HF to DeciLM, score pruning activations, # prune the model and save pruned checkpoints) - input_model = CompressModel() + input_model = PuzzletronModel() converted_model = mtn.convert( input_model, mode=[ ( - "compress", + "puzzletron", { "puzzle_dir": str(hydra_cfg.puzzle_dir), "input_model_path": hydra_cfg.input_hf_model_path, @@ -115,7 +115,7 @@ def run_full_compress(hydra_config_path: str): ) dist.cleanup() - mprint("Compress Progress 8/8: compression pipeline completed (multi-gpu)") + mprint("Puzzletron Progress 8/8: puzzletron pipeline completed (multi-gpu)") def run_mip_only(hydra_config_path: str): @@ -144,12 +144,12 @@ def run_mip_only(hydra_config_path: str): ) # mip_and_realize_models (distributed processing) - # TODO: How to make it part of mnt.search() api, similarly to run_full_compress() API - mprint("Compress Progress 7/8: running MIP and realizing models (multi-gpu)") + # TODO: How to make it part of mtn.search() api, similarly to run_full_puzzletron() API + mprint("Puzzletron Progress 7/8: running MIP and realizing models (multi-gpu)") mip_and_realize_models.launch_mip_and_realize_model(hydra_cfg) dist.cleanup() - mprint("Compress Progress 8/8: compression pipeline completed (multi-gpu)") + mprint("Puzzletron Progress 8/8: puzzletron pipeline completed (multi-gpu)") def main(): @@ -158,7 +158,7 @@ def main(): if args.mip_only: run_mip_only(hydra_config_path=args.config) else: - run_full_compress(hydra_config_path=args.config) +
run_full_puzzletron(hydra_config_path=args.config) if __name__ == "__main__": diff --git a/modelopt/torch/nas/plugins/megatron_hooks/base_hooks.py b/modelopt/torch/nas/plugins/megatron_hooks/base_hooks.py index bfc9b9290..56436acfd 100644 --- a/modelopt/torch/nas/plugins/megatron_hooks/base_hooks.py +++ b/modelopt/torch/nas/plugins/megatron_hooks/base_hooks.py @@ -26,8 +26,8 @@ from torch import nn import modelopt.torch.utils.distributed as dist -from modelopt.torch._compress.tools.logger import aprint -from modelopt.torch._compress.tools.robust_json import json_dump +from modelopt.torch.puzzletron.tools.logger import aprint +from modelopt.torch.puzzletron.tools.robust_json import json_dump __all__ = [ "ForwardHook", diff --git a/modelopt/torch/_compress/README.md b/modelopt/torch/puzzletron/README.md similarity index 100% rename from modelopt/torch/_compress/README.md rename to modelopt/torch/puzzletron/README.md diff --git a/modelopt/torch/_compress/__init__.py b/modelopt/torch/puzzletron/__init__.py similarity index 100% rename from modelopt/torch/_compress/__init__.py rename to modelopt/torch/puzzletron/__init__.py diff --git a/modelopt/torch/_compress/activation_scoring/activation_hooks/__init__.py b/modelopt/torch/puzzletron/activation_scoring/activation_hooks/__init__.py similarity index 100% rename from modelopt/torch/_compress/activation_scoring/activation_hooks/__init__.py rename to modelopt/torch/puzzletron/activation_scoring/activation_hooks/__init__.py diff --git a/modelopt/torch/_compress/activation_scoring/activation_hooks/utils.py b/modelopt/torch/puzzletron/activation_scoring/activation_hooks/utils.py similarity index 97% rename from modelopt/torch/_compress/activation_scoring/activation_hooks/utils.py rename to modelopt/torch/puzzletron/activation_scoring/activation_hooks/utils.py index 931ac762f..ab7eed2ac 100644 --- a/modelopt/torch/_compress/activation_scoring/activation_hooks/utils.py +++ b/modelopt/torch/puzzletron/activation_scoring/activation_hooks/utils.py @@ -15,11 +15,11 @@ # mypy: ignore-errors """Provides a function to register activation hooks for a model. -Activation hooks are used to compute activation scores for pruning.""" +Activation hooks are used to compute activation scores for pruning. 
+""" import re -from modelopt.torch._compress.decilm.deci_lm_hf_code.modeling_decilm import DeciLMForCausalLM from modelopt.torch.nas.plugins.megatron_hooks.base_hooks import ( ForwardHook, IndependentChannelContributionHook, @@ -27,6 +27,7 @@ IterativeChannelContributionHook, LayerNormContributionHook, ) +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.modeling_decilm import DeciLMForCausalLM def register_activation_hooks( diff --git a/modelopt/torch/_compress/activation_scoring/score_pruning_activations.py b/modelopt/torch/puzzletron/activation_scoring/score_pruning_activations.py similarity index 92% rename from modelopt/torch/_compress/activation_scoring/score_pruning_activations.py rename to modelopt/torch/puzzletron/activation_scoring/score_pruning_activations.py index f271a5f4f..ef5e5e9ad 100644 --- a/modelopt/torch/_compress/activation_scoring/score_pruning_activations.py +++ b/modelopt/torch/puzzletron/activation_scoring/score_pruning_activations.py @@ -19,13 +19,12 @@ from omegaconf import DictConfig import modelopt.torch.utils.distributed as dist -from modelopt.torch._compress.tools.logger import mprint -from modelopt.torch._compress.tools.validate_model import validate_model +from modelopt.torch.puzzletron.tools.logger import mprint +from modelopt.torch.puzzletron.tools.validate_model import validate_model def has_checkpoint_support(activation_hooks_kwargs: dict) -> bool: - """ - Determine if the activation hook method has proper checkpoint support implemented. + """Determine if the activation hook method has proper checkpoint support implemented. Args: activation_hooks_kwargs: Hook configuration @@ -47,8 +46,7 @@ def has_checkpoint_support(activation_hooks_kwargs: dict) -> bool: def check_scoring_completion(activations_log_dir: str, activation_hooks_kwargs=None) -> bool: - """ - Check if scoring is already completed by looking for the expected output files. + """Check if scoring is already completed by looking for the expected output files. Also checks if the scoring method is safe for resume. Args: @@ -89,8 +87,7 @@ def check_scoring_completion(activations_log_dir: str, activation_hooks_kwargs=N def should_skip_scoring_completely(cfg: DictConfig) -> bool: - """ - Determine if we should skip scoring entirely (only if 100% complete). + """Determine if we should skip scoring entirely (only if 100% complete). Partial progress should proceed to validate_model for proper resume. Args: diff --git a/modelopt/torch/_compress/build_library_and_stats.py b/modelopt/torch/puzzletron/build_library_and_stats.py similarity index 78% rename from modelopt/torch/_compress/build_library_and_stats.py rename to modelopt/torch/puzzletron/build_library_and_stats.py index 28e0f386c..5f04f6049 100644 --- a/modelopt/torch/_compress/build_library_and_stats.py +++ b/modelopt/torch/puzzletron/build_library_and_stats.py @@ -14,8 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -Unified command that runs build_replacement_library followed by calc_subblock_stats. +"""Unified command that runs build_replacement_library followed by calc_subblock_stats. This script combines the functionality of both commands into a single workflow: 1. 
First, it builds the replacement library for the puzzle @@ -23,27 +22,23 @@ Usage: - python modelopt.torch._compress.build_library_and_stats.py --config-dir configs --config-name Llama-3_1-8B puzzle_dir=/path/to/puzzle/dir dataset_path=/path/to/dataset + python -m modelopt.torch.puzzletron.build_library_and_stats --config-dir configs --config-name Llama-3_1-8B puzzle_dir=/path/to/puzzle/dir dataset_path=/path/to/dataset The script uses the same Hydra configuration as the individual commands and supports all the same configuration parameters for both build_replacement_library and calc_subblock_stats. """ -import hydra from omegaconf import DictConfig -from modelopt.torch._compress.replacement_library.build_replacement_library import ( +from modelopt.torch.puzzletron.replacement_library.build_replacement_library import ( launch_build_replacement_library, ) -from modelopt.torch._compress.subblock_stats.calc_subblock_stats import launch_calc_subblock_stats -from modelopt.torch._compress.tools.hydra_utils import register_hydra_resolvers -from modelopt.torch._compress.tools.logger import mprint -from modelopt.torch._compress.utils.parsing import format_global_config +from modelopt.torch.puzzletron.subblock_stats.calc_subblock_stats import launch_calc_subblock_stats +from modelopt.torch.puzzletron.tools.logger import mprint def launch_build_library_and_stats(cfg: DictConfig) -> None: - """ - Launch both build_replacement_library and calc_subblock_stats in sequence. + """Launch both build_replacement_library and calc_subblock_stats in sequence. Args: cfg: Hydra configuration containing settings for both commands diff --git a/modelopt/torch/_compress/dataset/__init__.py b/modelopt/torch/puzzletron/dataset/__init__.py similarity index 100% rename from modelopt/torch/_compress/dataset/__init__.py rename to modelopt/torch/puzzletron/dataset/__init__.py diff --git a/modelopt/torch/_compress/dataset/prepare_dataset.py b/modelopt/torch/puzzletron/dataset/prepare_dataset.py similarity index 97% rename from modelopt/torch/_compress/dataset/prepare_dataset.py rename to modelopt/torch/puzzletron/dataset/prepare_dataset.py index 072640777..6f1749697 100644 --- a/modelopt/torch/_compress/dataset/prepare_dataset.py +++ b/modelopt/torch/puzzletron/dataset/prepare_dataset.py @@ -19,7 +19,7 @@ import fire import numpy as np -from modelopt.torch._compress.tools.logger import mprint +from modelopt.torch.puzzletron.tools.logger import mprint def process_and_save_dataset( diff --git a/modelopt/torch/_compress/decilm/conversion_utils.py b/modelopt/torch/puzzletron/decilm/conversion_utils.py similarity index 100% rename from modelopt/torch/_compress/decilm/conversion_utils.py rename to modelopt/torch/puzzletron/decilm/conversion_utils.py diff --git a/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py b/modelopt/torch/puzzletron/decilm/converters/convert_llama3_to_decilm.py similarity index 93% rename from modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py rename to modelopt/torch/puzzletron/decilm/converters/convert_llama3_to_decilm.py index 4df9f009a..c5f107ea1 100644 --- a/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py +++ b/modelopt/torch/puzzletron/decilm/converters/convert_llama3_to_decilm.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License.
-""" -Convert a Llama3 model to a DeciLM model.""" +"""Convert a Llama3 model to a DeciLM model.""" #!/usr/bin/env python3 from pathlib import Path @@ -23,10 +22,10 @@ from fire import Fire from transformers import LlamaConfig -from modelopt.torch._compress.decilm.conversion_utils import convert_model_weights_to_decilm -from modelopt.torch._compress.decilm.deci_lm_hf_code.configuration_decilm import DeciLMConfig -from modelopt.torch._compress.tools.checkpoint_utils import copy_tokenizer -from modelopt.torch._compress.tools.checkpoint_utils_hf import copy_deci_lm_hf_code +from modelopt.torch.puzzletron.decilm.conversion_utils import convert_model_weights_to_decilm +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.configuration_decilm import DeciLMConfig +from modelopt.torch.puzzletron.tools.checkpoint_utils import copy_tokenizer +from modelopt.torch.puzzletron.tools.checkpoint_utils_hf import copy_deci_lm_hf_code """ example: diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/__init__.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/__init__.py similarity index 100% rename from modelopt/torch/_compress/decilm/deci_lm_hf_code/__init__.py rename to modelopt/torch/puzzletron/decilm/deci_lm_hf_code/__init__.py diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/block_config.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/block_config.py similarity index 100% rename from modelopt/torch/_compress/decilm/deci_lm_hf_code/block_config.py rename to modelopt/torch/puzzletron/decilm/deci_lm_hf_code/block_config.py diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/configuration_decilm.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/configuration_decilm.py similarity index 100% rename from modelopt/torch/_compress/decilm/deci_lm_hf_code/configuration_decilm.py rename to modelopt/torch/puzzletron/decilm/deci_lm_hf_code/configuration_decilm.py diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/megatron_lm__mamba_mixer.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/megatron_lm__mamba_mixer.py similarity index 100% rename from modelopt/torch/_compress/decilm/deci_lm_hf_code/megatron_lm__mamba_mixer.py rename to modelopt/torch/puzzletron/decilm/deci_lm_hf_code/megatron_lm__mamba_mixer.py diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/megatron_lm__megatron_tokenizer.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/megatron_lm__megatron_tokenizer.py similarity index 100% rename from modelopt/torch/_compress/decilm/deci_lm_hf_code/megatron_lm__megatron_tokenizer.py rename to modelopt/torch/puzzletron/decilm/deci_lm_hf_code/megatron_lm__megatron_tokenizer.py diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/megatron_lm__tokenizer.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/megatron_lm__tokenizer.py similarity index 100% rename from modelopt/torch/_compress/decilm/deci_lm_hf_code/megatron_lm__tokenizer.py rename to modelopt/torch/puzzletron/decilm/deci_lm_hf_code/megatron_lm__tokenizer.py diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/modeling_decilm.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/modeling_decilm.py similarity index 100% rename from modelopt/torch/_compress/decilm/deci_lm_hf_code/modeling_decilm.py rename to modelopt/torch/puzzletron/decilm/deci_lm_hf_code/modeling_decilm.py diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/tokenization_decilm.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/tokenization_decilm.py similarity index 100% rename from 
modelopt/torch/_compress/decilm/deci_lm_hf_code/tokenization_decilm.py rename to modelopt/torch/puzzletron/decilm/deci_lm_hf_code/tokenization_decilm.py diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_44_2__activations.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_44_2__activations.py similarity index 100% rename from modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_44_2__activations.py rename to modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_44_2__activations.py diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_44_2__cache_utils.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_44_2__cache_utils.py similarity index 100% rename from modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_44_2__cache_utils.py rename to modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_44_2__cache_utils.py diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_44_2__configuration_llama.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_44_2__configuration_llama.py similarity index 100% rename from modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_44_2__configuration_llama.py rename to modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_44_2__configuration_llama.py diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_44_2__modeling_attn_mask_utils.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_44_2__modeling_attn_mask_utils.py similarity index 100% rename from modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_44_2__modeling_attn_mask_utils.py rename to modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_44_2__modeling_attn_mask_utils.py diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_44_2__modeling_flash_attention_utils_backward_compat.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_44_2__modeling_flash_attention_utils_backward_compat.py similarity index 100% rename from modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_44_2__modeling_flash_attention_utils_backward_compat.py rename to modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_44_2__modeling_flash_attention_utils_backward_compat.py diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_44_2__modeling_outputs.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_44_2__modeling_outputs.py similarity index 100% rename from modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_44_2__modeling_outputs.py rename to modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_44_2__modeling_outputs.py diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_44_2__modeling_rope_utils.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_44_2__modeling_rope_utils.py similarity index 100% rename from modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_44_2__modeling_rope_utils.py rename to modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_44_2__modeling_rope_utils.py diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_44_2__pytorch_utils.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_44_2__pytorch_utils.py similarity index 100% rename from modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_44_2__pytorch_utils.py rename to 
modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_44_2__pytorch_utils.py diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_51_3__cache_utils.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_51_3__cache_utils.py similarity index 100% rename from modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_51_3__cache_utils.py rename to modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_51_3__cache_utils.py diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_51_3__configuration_llama4.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_51_3__configuration_llama4.py similarity index 100% rename from modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_51_3__configuration_llama4.py rename to modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_51_3__configuration_llama4.py diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_51_3__modeling_llama4_attention.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_51_3__modeling_llama4_attention.py similarity index 100% rename from modelopt/torch/_compress/decilm/deci_lm_hf_code/transformers_4_51_3__modeling_llama4_attention.py rename to modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_4_51_3__modeling_llama4_attention.py diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/variable_cache.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/variable_cache.py similarity index 100% rename from modelopt/torch/_compress/decilm/deci_lm_hf_code/variable_cache.py rename to modelopt/torch/puzzletron/decilm/deci_lm_hf_code/variable_cache.py diff --git a/modelopt/torch/_compress/decilm/deci_lm_hf_code/vllm_yarn_utils.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/vllm_yarn_utils.py similarity index 100% rename from modelopt/torch/_compress/decilm/deci_lm_hf_code/vllm_yarn_utils.py rename to modelopt/torch/puzzletron/decilm/deci_lm_hf_code/vllm_yarn_utils.py diff --git a/modelopt/torch/_compress/mip/mip_and_realize_models.py b/modelopt/torch/puzzletron/mip/mip_and_realize_models.py similarity index 89% rename from modelopt/torch/_compress/mip/mip_and_realize_models.py rename to modelopt/torch/puzzletron/mip/mip_and_realize_models.py index a3a1a84b9..e241021ec 100644 --- a/modelopt/torch/_compress/mip/mip_and_realize_models.py +++ b/modelopt/torch/puzzletron/mip/mip_and_realize_models.py @@ -17,20 +17,19 @@ # mypy: ignore-errors from pathlib import Path -from typing import List import torch from omegaconf import DictConfig import modelopt.torch.utils.distributed as dist -from modelopt.torch._compress.mip.run_puzzle import run_puzzle -from modelopt.torch._compress.tools.logger import mprint -from modelopt.torch._compress.tools.validate_puzzle_with_multi_replacements import ( +from modelopt.torch.puzzletron.mip.run_puzzle import run_puzzle +from modelopt.torch.puzzletron.tools.logger import mprint +from modelopt.torch.puzzletron.tools.validate_puzzle_with_multi_replacements import ( validate_puzzle_solutions, ) -def launch_mip(cfg: DictConfig) -> List[str]: +def launch_mip(cfg: DictConfig) -> list[str]: solution_paths = run_puzzle(args=cfg.mip) return solution_paths diff --git a/modelopt/torch/_compress/mip/mip_with_multi_layer_replacements.py b/modelopt/torch/puzzletron/mip/mip_with_multi_layer_replacements.py similarity index 95% rename from modelopt/torch/_compress/mip/mip_with_multi_layer_replacements.py rename to 
modelopt/torch/puzzletron/mip/mip_with_multi_layer_replacements.py index 438db3312..5b4eccbc1 100644 --- a/modelopt/torch/_compress/mip/mip_with_multi_layer_replacements.py +++ b/modelopt/torch/puzzletron/mip/mip_with_multi_layer_replacements.py @@ -19,14 +19,14 @@ import math import warnings from collections import defaultdict +from collections.abc import Hashable, Iterable from copy import deepcopy from random import random -from typing import Any, Hashable, Iterable, Optional, TypeAlias +from typing import Any, TypeAlias from mip import BINARY, Model, maximize, minimize, xsum -from modelopt.torch._compress.mip.utils import ( - InfeasibleError, +from modelopt.torch.puzzletron.mip.utils import ( consecutive_ngrams, get_nested_key, sort_replacements, @@ -42,7 +42,7 @@ def run_mip( objective: str, constraints: dict[str, float], bigger_is_better: bool, - max_seconds_per_solution: Optional[float] = None, + max_seconds_per_solution: float | None = None, ) -> tuple[ChosenReplacements, float, dict[str, float]]: orig_num_replacements = len(replacements) replacements = { @@ -60,7 +60,7 @@ def run_mip( mip_model = Model() objective_vars = [] - constraint_vars = {constraint_key: [] for constraint_key in constraints.keys()} + constraint_vars = {constraint_key: [] for constraint_key in constraints} choice_indicators_by_layer = defaultdict(list) for replacement_id, replacement in replacements.items(): is_chosen = mip_model.add_var(var_type=BINARY) @@ -71,7 +71,7 @@ def run_mip( objective_vars.append(is_chosen * get_nested_key(replacement, objective)) - for constraint_key in constraints.keys(): + for constraint_key in constraints: constraint_vars[constraint_key].append( is_chosen * get_nested_key(replacement, constraint_key) ) @@ -107,7 +107,7 @@ def run_mip( # Trust But Verify: calculate total value and costs, and check that all the constraints are filled total_value = 0.0 - total_costs = {constraint_key: 0 for constraint_key in constraints.keys()} + total_costs = dict.fromkeys(constraints.keys(), 0) chosen_replacements: ChosenReplacements = [] chosen_layers = [] for replacement_id, replacement in replacements.items(): @@ -116,7 +116,7 @@ def run_mip( assert replacement not in chosen_replacements chosen_replacements.append(replacement) total_value += get_nested_key(replacement, objective) - for constraint_key in constraints.keys(): + for constraint_key in constraints: total_costs[constraint_key] += get_nested_key(replacement, constraint_key) for parent_layer_idx in replacement["parent_layer_indices"]: assert parent_layer_idx not in chosen_layers diff --git a/modelopt/torch/_compress/mip/run_puzzle.py b/modelopt/torch/puzzletron/mip/run_puzzle.py similarity index 96% rename from modelopt/torch/_compress/mip/run_puzzle.py rename to modelopt/torch/puzzletron/mip/run_puzzle.py index 4868479e2..72919d27c 100644 --- a/modelopt/torch/_compress/mip/run_puzzle.py +++ b/modelopt/torch/puzzletron/mip/run_puzzle.py @@ -20,32 +20,33 @@ import dataclasses import enum import json +from collections.abc import Hashable, Iterable from copy import deepcopy from pathlib import Path -from typing import Any, Hashable, Iterable, List, Literal, TypeAlias +from typing import Any, Literal, TypeAlias import numpy as np import yaml from omegaconf import DictConfig, ListConfig, OmegaConf -from modelopt.torch._compress.decilm.deci_lm_hf_code.block_config import ( +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import ( AttentionConfig, BlockConfig, FFNConfig, ) -from 
modelopt.torch._compress.mip.mip_with_multi_layer_replacements import ( +from modelopt.torch.puzzletron.mip.mip_with_multi_layer_replacements import ( run_mip as run_multi_layer_replacement_mip, ) -from modelopt.torch._compress.replacement_library.replacement_utils import ( +from modelopt.torch.puzzletron.replacement_library.replacement_utils import ( extract_block_configs_and_locations, parse_layer_replacement, replacement_is_teacher, ) -from modelopt.torch._compress.tools.checkpoint_utils import load_model_config -from modelopt.torch._compress.tools.logger import mprint -from modelopt.torch._compress.tools.robust_json import json_dump -from modelopt.torch._compress.utils.parsing import get_nested_key, parse_json, parse_path -from modelopt.torch._compress.utils.utils import block_config_to_str, solution_to_str +from modelopt.torch.puzzletron.tools.checkpoint_utils import load_model_config +from modelopt.torch.puzzletron.tools.logger import mprint +from modelopt.torch.puzzletron.tools.robust_json import json_dump +from modelopt.torch.puzzletron.utils.parsing import get_nested_key, parse_json, parse_path +from modelopt.torch.puzzletron.utils.utils import block_config_to_str, solution_to_str """ Usage: @@ -418,7 +419,7 @@ def _assert_valid_config(args, puzzle_profile): exit(1) -def _get_minimal_unique_names(dicts: List[dict]) -> List[str]: +def _get_minimal_unique_names(dicts: list[dict]) -> list[str]: all_keys = set(k for d in dicts for k in d.keys()) all_values = {k: set(d[k] for d in dicts if k in d) for k in all_keys} non_common_keys = [k for k, values in all_values.items() if len(values) > 1] @@ -426,7 +427,7 @@ def _get_minimal_unique_names(dicts: List[dict]) -> List[str]: return ["-".join(f"{k}_{d[k]}".replace(".", "_") for k in non_common_keys) for d in dicts] -def run_puzzle(args: argparse.Namespace | DictConfig) -> List[str]: +def run_puzzle(args: argparse.Namespace | DictConfig) -> list[str]: # Loads config from args/puzzle_profile if args.puzzle_profile is not None: with open(args.puzzle_profile) as f: @@ -578,9 +579,7 @@ def _parse_teacher_block_metrics( "block_idx": block_idx, "parent_layer_indices": [block_idx], "metrics": { - **{ - metric_name: 0.0 for metric_name in all_metric_names - }, # default value 0. for teacher + **dict.fromkeys(all_metric_names, 0.0), # default value 0. 
for teacher **_extract_average_metrics(raw_metrics), # override with real value if exists }, **( @@ -597,7 +596,7 @@ def _parse_teacher_block_metrics( def _extract_average_metrics(raw_metrics: dict[str, dict]) -> dict[str, float]: average_metrics = dict() - for metric_name in raw_metrics.keys(): + for metric_name in raw_metrics: metric_dict = raw_metrics[metric_name] if isinstance(metric_dict, dict) and ("avg" in metric_dict.keys()): metric_value = raw_metrics[metric_name]["avg"] diff --git a/modelopt/torch/_compress/mip/utils.py b/modelopt/torch/puzzletron/mip/utils.py similarity index 100% rename from modelopt/torch/_compress/mip/utils.py rename to modelopt/torch/puzzletron/mip/utils.py diff --git a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py b/modelopt/torch/puzzletron/nas/plugins/puzzletron_nas_plugin.py similarity index 71% rename from modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py rename to modelopt/torch/puzzletron/nas/plugins/puzzletron_nas_plugin.py index 55b9d10b0..5e1eace93 100644 --- a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py +++ b/modelopt/torch/puzzletron/nas/plugins/puzzletron_nas_plugin.py @@ -13,30 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -Compress NAS plugin for the Modelopt framework (based on Puzzle algorithm: https://arxiv.org/abs/2411.19146). +"""Puzzletron NAS plugin for the Modelopt framework (based on Puzzle algorithm: https://arxiv.org/abs/2411.19146). It is used by mtn.convert() to convert a model from HF format to DeciLM format + do pruning scoring and save pruned checkpoints, and by mtn.search() to perform the MIP-based NAS search. """ -import datetime from pathlib import Path -import torch from torch import nn -import modelopt.torch._compress.mip.mip_and_realize_models as mip_and_realize_models -import modelopt.torch._compress.pruning.pruning_ckpts as pruning_ckpts -import modelopt.torch._compress.scoring.scoring as scoring +import modelopt.torch.puzzletron.mip.mip_and_realize_models as mip_and_realize_models +import modelopt.torch.puzzletron.pruning.pruning_ckpts as pruning_ckpts +import modelopt.torch.puzzletron.scoring.scoring as scoring import modelopt.torch.utils.distributed as dist -from modelopt.torch._compress import build_library_and_stats -from modelopt.torch._compress.activation_scoring import score_pruning_activations -from modelopt.torch._compress.decilm.converters.convert_llama3_to_decilm import ( - convert_llama3_to_decilm, -) -from modelopt.torch._compress.tools.hydra_utils import initialize_hydra_config_for_dir -from modelopt.torch._compress.tools.logger import mprint from modelopt.torch.nas.conversion import NASModeRegistry from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField from modelopt.torch.opt.mode import ( @@ -47,14 +37,21 @@ RestoreEntrypoint, ) from modelopt.torch.opt.searcher import BaseSearcher, SearchStateDict +from modelopt.torch.puzzletron import build_library_and_stats +from modelopt.torch.puzzletron.activation_scoring import score_pruning_activations +from modelopt.torch.puzzletron.decilm.converters.convert_llama3_to_decilm import ( + convert_llama3_to_decilm, +) +from modelopt.torch.puzzletron.tools.hydra_utils import initialize_hydra_config_for_dir +from modelopt.torch.puzzletron.tools.logger import mprint -class CompressModel(nn.Module): - pass # No model implementation is needed for the compress mode +class PuzzletronModel(nn.Module): + pass # No model implementation is needed for 
the puzzletron mode -class CompressConfig(ModeloptBaseConfig): - """Configuration for Compress NAS algorithm.""" +class PuzzletronConfig(ModeloptBaseConfig): + """Configuration for Puzzletron NAS algorithm.""" # Input model path to compress in the HF format input_model_path: str = ModeloptField( @@ -92,7 +89,7 @@ class CompressConfig(ModeloptBaseConfig): ) -def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertReturnType: +def convert_puzzletron_model(model: nn.Module, config: PuzzletronConfig) -> ConvertReturnType: """1. Convert the model from HF format to DeciLM format. 2. Score the pruning activations. 3. Prune the model and save pruned checkpoints @@ -118,7 +115,7 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR # Convert Llama3 model to DeciLM model # TODO: Make it generic, do not call convert_llama3_to_decilm directly. if dist.is_master(): - mprint("Compress Progress 2/8: converting model from HF to DeciLM (single-gpu)") + mprint("Puzzletron Progress 2/8: converting model from HF to DeciLM (single-gpu)") hf_ckpt_teacher_dir = "ckpts/teacher" # TODO: make it configurable convert_llama3_to_decilm( input_dir=config.input_model_path, @@ -127,13 +124,13 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR dist.barrier() # Score_pruning_activations (distributed processing) - mprint("Compress Progress 3/8: scoring pruning activations (multi-gpu)") + mprint("Puzzletron Progress 3/8: scoring pruning activations (multi-gpu)") score_pruning_activations.launch_score_activations(hydra_cfg) # Prune the model and save pruned checkpoints if dist.is_master(): mprint( - "Compress Progress 4/8: pruning the model and saving pruned checkpoints (single-gpu)" + "Puzzletron Progress 4/8: pruning the model and saving pruned checkpoints (single-gpu)" ) pruning_ckpts.launch_prune_ckpt(hydra_cfg) dist.barrier() @@ -141,58 +138,57 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR return model, {} -def restore_compress_model( - model: nn.Module, config: CompressConfig, metadata: MetadataDict +def restore_puzzletron_model( + model: nn.Module, config: PuzzletronConfig, metadata: MetadataDict ) -> nn.Module: - """Restore is not needed for the compress mode as we are not saving any model state""" + """Restore is not needed for the puzzletron mode as we are not saving any model state""" return model @NASModeRegistry.register_mode -class CompressDescriptor(ModeDescriptor): - """Descriptor for the Compress mode.""" +class PuzzletronDescriptor(ModeDescriptor): + """Descriptor for the Puzzletron mode.""" @property def name(self) -> str: """String identifier for this mode.""" - return "compress" + return "puzzletron" @property def config_class(self) -> type[ModeloptBaseConfig]: """Configuration class for this mode.""" - return CompressConfig + return PuzzletronConfig @property def search_algorithm(self) -> type[BaseSearcher]: """Return the associated searcher implementation.""" - - return CompressSearcher + return PuzzletronSearcher @property def convert(self) -> ConvertEntrypoint: """Entrypoint to convert a model.""" - return convert_compress_model + return convert_puzzletron_model @property def restore(self) -> RestoreEntrypoint: """Entrypoint to restore a model.""" - return restore_compress_model + return restore_puzzletron_model @property def export_mode(self) -> str | None: """The mode that corresponds to the export mode. 
For now, this will be a no-op as there is no modelopt's concept of search space defined - for the compress algorithm. + for the puzzletron algorithm. """ return "export_nas" -class CompressSearcher(BaseSearcher): - """Runs NAS search for the Compress mode.""" +class PuzzletronSearcher(BaseSearcher): + """Runs NAS search for the Puzzletron mode.""" @property def default_state_dict(self) -> SearchStateDict: - """Not needed for the compress mode as we are not saving any model state""" + """Not needed for the puzzletron mode as we are not saving any model state""" return {} def run_search(self) -> None: @@ -209,15 +205,15 @@ def run_search(self) -> None: # Build_library_and_stats (single process) if dist.is_master(): mprint( - "Compress Progress 5/8: building replacement library and subblock statistics (single-gpu)" + "Puzzletron Progress 5/8: building replacement library and subblock statistics (single-gpu)" ) build_library_and_stats.launch_build_library_and_stats(hydra_cfg) dist.barrier() # Calc_one_block_scores (distributed processing) - mprint("Compress Progress 6/8: calculating one block scores (multi-gpu)") + mprint("Puzzletron Progress 6/8: calculating one block scores (multi-gpu)") scoring.launch_scoring(hydra_cfg) # mip_and_realize_models (distributed processing) - mprint("Compress Progress 7/8: running MIP and realizing models (multi-gpu)") + mprint("Puzzletron Progress 7/8: running MIP and realizing models (multi-gpu)") mip_and_realize_models.launch_mip_and_realize_model(hydra_cfg) diff --git a/modelopt/torch/_compress/pruning/pruning_ckpts.py b/modelopt/torch/puzzletron/pruning/pruning_ckpts.py similarity index 91% rename from modelopt/torch/_compress/pruning/pruning_ckpts.py rename to modelopt/torch/puzzletron/pruning/pruning_ckpts.py index b413a3f78..5a0dfed01 100644 --- a/modelopt/torch/_compress/pruning/pruning_ckpts.py +++ b/modelopt/torch/puzzletron/pruning/pruning_ckpts.py @@ -23,28 +23,24 @@ import json import os import time -from typing import Optional -import hydra from omegaconf import DictConfig -from modelopt.torch._compress.tools.bypassed_training.child_init import ( +from modelopt.torch.puzzletron.tools.bypassed_training.child_init import ( GQAInitMode, HiddenSizeInitMode, LinearInitMode, MlpInitMode, ) -from modelopt.torch._compress.tools.bypassed_training.init_child_from_parent import ( +from modelopt.torch.puzzletron.tools.bypassed_training.init_child_from_parent import ( init_child_from_parent, ) -from modelopt.torch._compress.tools.checkpoint_utils import load_model_config -from modelopt.torch._compress.tools.hydra_utils import register_hydra_resolvers -from modelopt.torch._compress.tools.logger import mprint -from modelopt.torch._compress.tools.validate_model import validate_model +from modelopt.torch.puzzletron.tools.checkpoint_utils import load_model_config +from modelopt.torch.puzzletron.tools.logger import mprint def launch_ffn_intermediates_prune_ckpt( - cfg: DictConfig, max_save_workers: Optional[int] = None, max_layer_workers: Optional[int] = None + cfg: DictConfig, max_save_workers: int | None = None, max_layer_workers: int | None = None ): for intermediate_size in cfg.pruning.intermediate_size_list: dirname = f"ffn_{intermediate_size}_attn_no_op" @@ -87,7 +83,7 @@ def launch_ffn_intermediates_prune_ckpt( def launch_attn_groups_prune_ckpt( - cfg: DictConfig, max_save_workers: Optional[int] = None, max_layer_workers: Optional[int] = None + cfg: DictConfig, max_save_workers: int | None = None, max_layer_workers: int | None = None ): for n_heads_in_group 
in cfg.pruning.n_heads_in_group_list: dirname = f"n_heads_in_group{n_heads_in_group}" @@ -154,17 +150,17 @@ def launch_hidden_dim_prune_ckpt(cfg: DictConfig): else: intermediate_sizes.append(None) - mprint(f"Teacher config:") + mprint("Teacher config:") mprint(f" - hidden_size: {parent_hidden_size}") mprint(f" - intermediate_sizes: {intermediate_sizes}") os.makedirs(os.path.join(cfg.puzzle_dir, "ckpts"), exist_ok=True) for hidden_size in cfg.pruning.hidden_size_list: - mprint(f"\n######################################################################") + mprint("\n######################################################################") mprint(f"Hidden Size = {hidden_size}") - mprint(f"######################################################################\n") + mprint("######################################################################\n") - mprint(f"Child config:") + mprint("Child config:") mprint(f" - hidden_size: {hidden_size}") # Create model config overrides with proper FFN configuration @@ -208,9 +204,9 @@ def launch_hidden_dim_prune_ckpt(cfg: DictConfig): def launch_experts_prune_ckpt( cfg: DictConfig, - max_save_workers: Optional[int] = None, - max_layer_workers: Optional[int] = None, - symlink_suffix: Optional[str] = None, + max_save_workers: int | None = None, + max_layer_workers: int | None = None, + symlink_suffix: str | None = None, ): for num_experts in cfg.pruning.num_experts_to_keep_list: dirname = f"num_experts_{num_experts}" @@ -256,7 +252,7 @@ def launch_experts_prune_ckpt( def launch_moe_ffn_intermediates_prune_ckpt( - cfg: DictConfig, max_save_workers: Optional[int] = None, max_layer_workers: Optional[int] = None + cfg: DictConfig, max_save_workers: int | None = None, max_layer_workers: int | None = None ): for intermediate_size in cfg.pruning.intermediate_size_list: dirname = f"moe_ffn_{intermediate_size}_attn_no_op" @@ -312,14 +308,14 @@ def launch_prune_ckpt(cfg: DictConfig): max_layer_workers = int(os.environ["PRUNING_LAYER_WORKERS"]) # Log optimization settings (extracted from individual pruning methods) - mprint(f"Optimization Settings:") + mprint("Optimization Settings:") mprint( f" - I/O workers (max_workers): {'auto-calculate' if max_save_workers is None else max_save_workers}" ) mprint( f" - Layer workers (max_layer_workers): {'auto-calculate' if max_layer_workers is None else max_layer_workers}" ) - mprint(f" (Override with env vars: PRUNING_IO_WORKERS, PRUNING_LAYER_WORKERS)") + mprint(" (Override with env vars: PRUNING_IO_WORKERS, PRUNING_LAYER_WORKERS)") if target_layer == "mlp.down_proj": launch_ffn_intermediates_prune_ckpt(cfg, max_save_workers, max_layer_workers) @@ -331,7 +327,7 @@ def launch_prune_ckpt(cfg: DictConfig): # Check if we should use symlink suffix for chained pruning symlink_suffix = getattr(cfg.pruning, "symlink_suffix", None) launch_experts_prune_ckpt(cfg, max_save_workers, max_layer_workers, symlink_suffix) - elif target_layer == "regex:experts\.\d+\.down_proj$": + elif target_layer == r"regex:experts\.\d+\.down_proj$": launch_moe_ffn_intermediates_prune_ckpt(cfg, max_save_workers, max_layer_workers) else: raise NotImplementedError( diff --git a/modelopt/torch/_compress/compress.py b/modelopt/torch/puzzletron/puzzletron.py similarity index 74% rename from modelopt/torch/_compress/compress.py rename to modelopt/torch/puzzletron/puzzletron.py index 21e9df2af..1051fdbaf 100644 --- a/modelopt/torch/_compress/compress.py +++ b/modelopt/torch/puzzletron/puzzletron.py @@ -13,28 +13,23 @@ # See the License for the specific language governing 
permissions and
 # limitations under the License.

-"""
-
-This module provides the main compression function for a model
-using MIP-based NAS search algorithm.
-
-"""
+"""This module provides the main compression function for a model using a MIP-based NAS search algorithm."""

 from omegaconf import DictConfig

-import modelopt.torch._compress.activation_scoring.score_pruning_activations as score_pruning_activations
-import modelopt.torch._compress.build_library_and_stats as build_library_and_stats
-import modelopt.torch._compress.mip.mip_and_realize_models as mip_and_realize_models
-import modelopt.torch._compress.pruning.pruning_ckpts as pruning_ckpts
-import modelopt.torch._compress.scoring.scoring as scoring
+import modelopt.torch.puzzletron.activation_scoring.score_pruning_activations as score_pruning_activations
+import modelopt.torch.puzzletron.build_library_and_stats as build_library_and_stats
+import modelopt.torch.puzzletron.mip.mip_and_realize_models as mip_and_realize_models
+import modelopt.torch.puzzletron.pruning.pruning_ckpts as pruning_ckpts
+import modelopt.torch.puzzletron.scoring.scoring as scoring
 import modelopt.torch.utils.distributed as dist
-from modelopt.torch._compress.tools.hydra_utils import initialize_hydra_config_for_dir
+from modelopt.torch.puzzletron.tools.hydra_utils import initialize_hydra_config_for_dir


-def compress(
+def puzzletron(
     hydra_config_dir: str, hydra_config: str, puzzle_dir: str, dataset_path: str
 ) -> DictConfig:
-    """Compress a puzzletron model using the MIP-based NAS search algorithm.
+    """Compress a model using the MIP-based NAS search algorithm from Puzzletron.

     Args:
         hydra_config_dir (str): path to a hydra_config_dir that defines the search space
@@ -45,7 +40,7 @@ def compress(
     Returns:
         Hydra config object after compressing the model.
         The same hydra configuration object is used across all compression steps.
-        @TODO: Investigate if this config object is immutable across steps and clarify
+        TODO: Investigate if this config object is immutable across steps and clarify
     """
     # Step 0: Load puzzletron hydra config
     hydra_cfg = initialize_hydra_config_for_dir(
diff --git a/modelopt/torch/_compress/replacement_library/build_replacement_library.py b/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py
similarity index 93%
rename from modelopt/torch/_compress/replacement_library/build_replacement_library.py
rename to modelopt/torch/puzzletron/replacement_library/build_replacement_library.py
index 760952a60..1618aceaf 100644
--- a/modelopt/torch/_compress/replacement_library/build_replacement_library.py
+++ b/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py
@@ -12,56 +12,40 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
-This module constructs the replacement library JSON files from a puzzle directory containing
+"""This module constructs the replacement library JSON files from a puzzle directory containing
 multiple trained model checkpoints. It analyzes checkpoints to extract unique block and subblock
 configurations, builds a library of available replacements, and generates solutions for
 layer replacement in compressed models. The resulting replacement library can then be used
 by ReplacementLibrary to efficiently load models with mixed teacher/student layers.
- -Standard Puzzle Usage: -====================== -python -m modelopt.torch._compress.replacement_library.build_replacement_library PUZZLE_DIR - -Teacher checkpoint dir is assumed to be inside PUZZLE_DIR/ckpts/teacher (symlink is recommended) -though you can supply an explicit --teacher_checkpoint_dir. - ---add_ffn_no_ops and --add_attention_no_ops are optional (default True), - - -Untrained puzzle run (with bypass): -=================================== -The subblock that doesn't interest you in the checkpoint should be no_op. - """ # mypy: ignore-errors import json from pathlib import Path -from typing import Any, Type +from typing import Any import pandas as pd from omegaconf import DictConfig -from modelopt.torch._compress.decilm.deci_lm_hf_code.block_config import ( +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import ( AttentionConfig, BlockConfig, FFNConfig, ) -from modelopt.torch._compress.replacement_library.replacement_utils import ( +from modelopt.torch.puzzletron.replacement_library.replacement_utils import ( is_replacement_identical_to_teacher, replacement_is_teacher, sort_replacements, ) -from modelopt.torch._compress.tools.checkpoint_utils import ( +from modelopt.torch.puzzletron.tools.checkpoint_utils import ( SAFETENSORS_SUBBLOCKS_DIR_NAME, is_valid_decilm_checkpoint, load_model_config, ) -from modelopt.torch._compress.tools.logger import mprint -from modelopt.torch._compress.tools.robust_json import json_dump -from modelopt.torch._compress.utils.parsing import format_global_config -from modelopt.torch._compress.utils.utils import block_config_to_str, subblock_config_to_str +from modelopt.torch.puzzletron.tools.logger import mprint +from modelopt.torch.puzzletron.tools.robust_json import json_dump +from modelopt.torch.puzzletron.utils.parsing import format_global_config +from modelopt.torch.puzzletron.utils.utils import block_config_to_str, subblock_config_to_str UNIQUE_SUBBLOCK_IDENTIFIER = ["block_config", "attention_config", "ffn_config", "block_idx"] CHECKPOINTS_DIR_NAME = "ckpts" @@ -73,8 +57,7 @@ def build_replacement_library( add_ffn_no_ops: bool = True, add_attention_no_ops: bool = True, ) -> None: - """ - For normal puzzle runs, use default values. + """For normal puzzle runs, use default values. For advanced use cases, see the Usage section. """ master_puzzle_dir = Path(master_puzzle_dir) @@ -107,9 +90,7 @@ def build_replacement_library( def launch_build_replacement_library(cfg: DictConfig) -> None: - """ - Launch the build replacement library function with Hydra configuration. - """ + """Launch the build replacement library function with Hydra configuration.""" mprint(f"Building replacement library for puzzle directory: {cfg.puzzle_dir}") mprint(f"Teacher directory: {cfg.teacher_dir}") mprint( @@ -132,8 +113,8 @@ def infer_teacher_dir( teacher_checkpoint_dir = Path(master_puzzle_dir) / CHECKPOINTS_DIR_NAME / "teacher" if not teacher_checkpoint_dir.exists(): raise ValueError( - f"You must either provide the --teacher_checkpoint_dir argument, or create a link to the " - f"teacher dir under '{{PUZZLE_DIR}}/ckpts'." + "You must either provide the --teacher_checkpoint_dir argument, or create a link to the " + "teacher dir under '{PUZZLE_DIR}/ckpts'." 
) teacher_checkpoint_dir = Path(teacher_checkpoint_dir).resolve().absolute() return teacher_checkpoint_dir @@ -381,7 +362,7 @@ def _add_no_op_subblock_rows( def _get_rows_with_no_op_subblock( subblocks_df: pd.DataFrame, no_op_subblock: str -) -> tuple[pd.DataFrame, Type[AttentionConfig] | Type[FFNConfig]]: +) -> tuple[pd.DataFrame, type[AttentionConfig] | type[FFNConfig]]: other_subblock = "ffn" if no_op_subblock == "attention" else "attention" subblock_cls = AttentionConfig if no_op_subblock == "attention" else FFNConfig no_op_subblock_config = subblock_cls(no_op=True) diff --git a/modelopt/torch/_compress/replacement_library/replacement_library.py b/modelopt/torch/puzzletron/replacement_library/replacement_library.py similarity index 96% rename from modelopt/torch/_compress/replacement_library/replacement_library.py rename to modelopt/torch/puzzletron/replacement_library/replacement_library.py index 5e2fee6f0..bf6cc6636 100644 --- a/modelopt/torch/_compress/replacement_library/replacement_library.py +++ b/modelopt/torch/puzzletron/replacement_library/replacement_library.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" -Replacement library for efficiently loading and managing layer-replaced DeciLM models. +"""Replacement library for efficiently loading and managing layer-replaced DeciLM models. - Uses replacement_utils for parsing, sorting, and analyzing layer replacement configurations """ # mypy: ignore-errors @@ -21,7 +20,6 @@ import json import re from pathlib import Path -from typing import Optional import numpy as np import torch @@ -31,21 +29,21 @@ from torch import nn import modelopt.torch.utils.distributed as dist -from modelopt.torch._compress.decilm.deci_lm_hf_code.configuration_decilm import DeciLMConfig -from modelopt.torch._compress.decilm.deci_lm_hf_code.modeling_decilm import ( +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.configuration_decilm import DeciLMConfig +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.modeling_decilm import ( DeciLMDecoderLayer, DeciLMForCausalLM, DeciLMMultiDecoderLayer, DeciLMRMSNorm, LMHead, ) -from modelopt.torch._compress.replacement_library.replacement_utils import ( +from modelopt.torch.puzzletron.replacement_library.replacement_utils import ( extract_block_configs_and_locations, parse_layer_replacement, sort_replacements, weights_path_to_checkpoint_dir, ) -from modelopt.torch._compress.tools.checkpoint_utils import ( +from modelopt.torch.puzzletron.tools.checkpoint_utils import ( PTH_SUBBLOCKS_DIR_NAME, SAFETENSORS_SUBBLOCKS_DIR_NAME, infer_weights_dtype, @@ -53,7 +51,7 @@ init_module_with_state_dict, load_model_config, ) -from modelopt.torch._compress.tools.sharded_checkpoint_utils import ( +from modelopt.torch.puzzletron.tools.sharded_checkpoint_utils import ( create_dummy_model, is_in_safetensors_format, load_sharded_state_dict, @@ -64,7 +62,7 @@ class ReplacementLibrary: def __init__( self, replacement_library_path: str | Path, - model_config_overrides: Optional[dict] = None, + model_config_overrides: dict | None = None, ): self.replacement_library = self._load_replacement_library(replacement_library_path) self._ensure_all_checkpoints_are_split() @@ -223,7 +221,7 @@ def _load_layer_replacement(self, layer_replacement: dict) -> nn.ModuleList: if len(state_dict) > 0: block_indices = [ int(re.findall(r"^model\.layers\.(\d+)\.", param_name)[0]) - for param_name in 
state_dict.keys() + for param_name in state_dict ] assert sorted(set(block_indices)) == list( range(min(block_indices), max(block_indices) + 1) @@ -318,7 +316,7 @@ def _get_arbitrary_non_block_param(self, param_name: str) -> torch.Tensor: partial_state_dict = load_sharded_state_dict(checkpoint_dir, [param_name]) return partial_state_dict[param_name] - non_block_pth_path = checkpoint_dir / PTH_SUBBLOCKS_DIR_NAME / f"non_block.pth" + non_block_pth_path = checkpoint_dir / PTH_SUBBLOCKS_DIR_NAME / "non_block.pth" assert non_block_pth_path.exists(), _error_message_ensure_split(checkpoint_dir) non_block_state_dict = torch.load(non_block_pth_path) return non_block_state_dict[param_name] diff --git a/modelopt/torch/_compress/replacement_library/replacement_utils.py b/modelopt/torch/puzzletron/replacement_library/replacement_utils.py similarity index 91% rename from modelopt/torch/_compress/replacement_library/replacement_utils.py rename to modelopt/torch/puzzletron/replacement_library/replacement_utils.py index 331357d2b..68ba0b5fc 100644 --- a/modelopt/torch/_compress/replacement_library/replacement_utils.py +++ b/modelopt/torch/puzzletron/replacement_library/replacement_utils.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" -This module provides helper functions for parsing, sorting, and analyzing layer replacement +"""This module provides helper functions for parsing, sorting, and analyzing layer replacement configurations used in the replacement library for model compression. """ @@ -22,9 +21,9 @@ from copy import deepcopy from pathlib import Path -from modelopt.torch._compress.decilm.deci_lm_hf_code.block_config import BlockConfig -from modelopt.torch._compress.decilm.deci_lm_hf_code.configuration_decilm import DeciLMConfig -from modelopt.torch._compress.mip.utils import sort_replacements +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import BlockConfig +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.configuration_decilm import DeciLMConfig +from modelopt.torch.puzzletron.mip.utils import sort_replacements def parse_layer_replacement(layer_replacement: dict | str) -> dict: @@ -44,7 +43,7 @@ def parse_layer_replacement(layer_replacement: dict | str) -> dict: return layer_replacement -# sort_replacements moved to modelopt.torch._compress.mip.utils and imported above +# sort_replacements moved to modelopt.torch.puzzletron.mip.utils and imported above def extract_block_configs_and_locations( diff --git a/modelopt/torch/_compress/scoring/scoring.py b/modelopt/torch/puzzletron/scoring/scoring.py similarity index 91% rename from modelopt/torch/_compress/scoring/scoring.py rename to modelopt/torch/puzzletron/scoring/scoring.py index 5f745b399..8f1871de8 100644 --- a/modelopt/torch/_compress/scoring/scoring.py +++ b/modelopt/torch/puzzletron/scoring/scoring.py @@ -19,18 +19,16 @@ import os import re from glob import glob -from pathlib import Path import hydra import numpy as np import pandas as pd -import torch from omegaconf import DictConfig import modelopt.torch.utils.distributed as dist -from modelopt.torch._compress.tools.hydra_utils import register_hydra_resolvers -from modelopt.torch._compress.tools.logger import mprint -from modelopt.torch._compress.tools.validate_puzzle_with_multi_replacements import ( +from modelopt.torch.puzzletron.tools.hydra_utils import register_hydra_resolvers +from 
modelopt.torch.puzzletron.tools.logger import mprint +from modelopt.torch.puzzletron.tools.validate_puzzle_with_multi_replacements import ( validate_puzzle_solutions, ) diff --git a/modelopt/torch/_compress/sewing_kit/__init__.py b/modelopt/torch/puzzletron/sewing_kit/__init__.py similarity index 100% rename from modelopt/torch/_compress/sewing_kit/__init__.py rename to modelopt/torch/puzzletron/sewing_kit/__init__.py diff --git a/modelopt/torch/_compress/sewing_kit/core.py b/modelopt/torch/puzzletron/sewing_kit/core.py similarity index 99% rename from modelopt/torch/_compress/sewing_kit/core.py rename to modelopt/torch/puzzletron/sewing_kit/core.py index 8f926954b..41eaeee75 100644 --- a/modelopt/torch/_compress/sewing_kit/core.py +++ b/modelopt/torch/puzzletron/sewing_kit/core.py @@ -197,6 +197,8 @@ def output( @dataclass class ExternalTarget(TargetWithNamedInputs, TargetWithNamedOutputs, metaclass=Singleton): + """External target for stitched modules.""" + @override def __hash__(self) -> int: return super().__hash__() diff --git a/modelopt/torch/_compress/sewing_kit/passage/__init__.py b/modelopt/torch/puzzletron/sewing_kit/passage/__init__.py similarity index 100% rename from modelopt/torch/_compress/sewing_kit/passage/__init__.py rename to modelopt/torch/puzzletron/sewing_kit/passage/__init__.py diff --git a/modelopt/torch/_compress/sewing_kit/passage/core.py b/modelopt/torch/puzzletron/sewing_kit/passage/core.py similarity index 99% rename from modelopt/torch/_compress/sewing_kit/passage/core.py rename to modelopt/torch/puzzletron/sewing_kit/passage/core.py index 22c720b50..c0fcb4b12 100644 --- a/modelopt/torch/_compress/sewing_kit/passage/core.py +++ b/modelopt/torch/puzzletron/sewing_kit/passage/core.py @@ -36,6 +36,8 @@ @dataclass class InputArgs: + """Container for input arguments to modules.""" + args: list[Any] kwargs: dict[str, Any] diff --git a/modelopt/torch/_compress/sewing_kit/utils.py b/modelopt/torch/puzzletron/sewing_kit/utils.py similarity index 100% rename from modelopt/torch/_compress/sewing_kit/utils.py rename to modelopt/torch/puzzletron/sewing_kit/utils.py diff --git a/modelopt/torch/_compress/subblock_stats/calc_subblock_params_and_memory.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py similarity index 95% rename from modelopt/torch/_compress/subblock_stats/calc_subblock_params_and_memory.py rename to modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py index e25c8e38d..2e8630bc9 100644 --- a/modelopt/torch/_compress/subblock_stats/calc_subblock_params_and_memory.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py @@ -28,14 +28,14 @@ import numpy as np import torch -from modelopt.torch._compress.decilm.deci_lm_hf_code.block_config import ( +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import ( AttentionConfig, FFNConfig, MambaConfig, ) -from modelopt.torch._compress.decilm.deci_lm_hf_code.configuration_decilm import DeciLMConfig -from modelopt.torch._compress.decilm.deci_lm_hf_code.modeling_decilm import DeciLMMoe -from modelopt.torch._compress.utils.utils import ( +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.configuration_decilm import DeciLMConfig +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.modeling_decilm import DeciLMMoe +from modelopt.torch.puzzletron.utils.utils import ( calculate_kv_dim, raise_unknown_subblock_config_error, sizeof_dtype, @@ -117,7 +117,7 @@ def calc_subblock_active_params( def load_moe_stats(stats_file: str) 
-> dict: - with open(stats_file, "r") as f: + with open(stats_file) as f: stats = json.load(f) return [np.array(l) / np.sum(l) if len(l) > 0 else 0 for l in stats] @@ -178,10 +178,9 @@ def calculate_attention_memory( kv_cache_dtype: torch.dtype, allocate_prefill_query: bool, ) -> dict[str, float]: - """ - allocate_prefill_query: infery-llm style. - Infery used a unified Wqkv matrix, so before extracting the kv-cache, - the query also had to be kept in-memory, once per layer. + """allocate_prefill_query: infery-llm style. + Infery used a unified Wqkv matrix, so before extracting the kv-cache, + the query also had to be kept in-memory, once per layer. """ seq_len = prefill_seq_len + generation_seq_len if ( diff --git a/modelopt/torch/_compress/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py similarity index 93% rename from modelopt/torch/_compress/subblock_stats/calc_subblock_stats.py rename to modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py index 76e6c3428..07597eb5c 100644 --- a/modelopt/torch/_compress/subblock_stats/calc_subblock_stats.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py @@ -19,10 +19,11 @@ import dataclasses import json import os +from collections.abc import Iterable from functools import partial from itertools import product from pathlib import Path -from typing import Iterable, Optional, Type, TypeVar +from typing import TypeVar os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" @@ -32,38 +33,29 @@ from omegaconf import DictConfig, ListConfig, OmegaConf from tqdm import tqdm -from modelopt.torch._compress.decilm.deci_lm_hf_code.block_config import ( +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import ( AttentionConfig, BlockConfig, FFNConfig, SubblockConfig, ) -from modelopt.torch._compress.decilm.deci_lm_hf_code.configuration_decilm import DeciLMConfig -from modelopt.torch._compress.replacement_library.replacement_utils import parse_layer_replacement -from modelopt.torch._compress.subblock_stats.calc_subblock_params_and_memory import ( +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.configuration_decilm import DeciLMConfig +from modelopt.torch.puzzletron.replacement_library.replacement_utils import parse_layer_replacement +from modelopt.torch.puzzletron.subblock_stats.calc_subblock_params_and_memory import ( calc_subblock_active_params, calculate_non_block_memory, calculate_non_block_params, calculate_subblock_memory, calculate_subblock_params, ) -from modelopt.torch._compress.tools.checkpoint_utils import load_model_config -from modelopt.torch._compress.tools.logger import mprint -from modelopt.torch._compress.tools.robust_json import json_dump -from modelopt.torch._compress.utils.parsing import format_global_config +from modelopt.torch.puzzletron.tools.checkpoint_utils import load_model_config +from modelopt.torch.puzzletron.tools.logger import mprint +from modelopt.torch.puzzletron.tools.robust_json import json_dump +from modelopt.torch.puzzletron.utils.parsing import format_global_config # Type variable for dataclasses T_DataClass = TypeVar("T_DataClass") -""" -Usage: -python -m modelopt.torch._compress.subblock_stats.calc_subblock_stats PUZZLE_DIR [ --benchmark_iterations 1000 ] - ---benchmark_iterations=None (the default) means that the code won't use infery to benchmark runtime, - only memory stats will be calculated. If you want to benchmark runtime, run inside an infery-llm docker. 
- -""" - def calculate_subblock_stats( calc_subblock_stats_config: DictConfig, @@ -77,7 +69,7 @@ def calculate_subblock_stats( n_embd: int, n_head: int, vocab_size: int, - benchmark_iterations: Optional[int], + benchmark_iterations: int | None, use_cuda_graph: bool, weights_dtype: torch.dtype, activations_dtype: torch.dtype, @@ -189,7 +181,6 @@ def calculate_subblock_stats( ) if is_calc_runtime: - pass # TODO: fix # from puzzle_tools.calc_subblock_runtime import measure_non_block_runtime_ms # non_block_runtime_ms, embedding_runtime_ms, lm_head_runtime_ms = \ @@ -215,9 +206,7 @@ def calculate_subblock_stats( def launch_calc_subblock_stats(cfg: DictConfig) -> None: - """ - Launch the calc subblock stats function with Hydra configuration. - """ + """Launch the calc subblock stats function with Hydra configuration.""" mprint(f"Calculating subblock stats for puzzle directory: {cfg.puzzle_dir}") mprint(f"Teacher directory: {cfg.teacher_dir}") mprint( @@ -456,7 +445,7 @@ def _load_subblock_configs_from_replacement_library( return subblock_configs -T_DataClass: TypeVar = Type[dataclasses.dataclass] +T_DataClass: TypeVar = type[dataclasses.dataclass] def _dataclass_from_dict( @@ -523,10 +512,7 @@ def _find_corresponding_bf16_stats(args: dict, subblock_stats: list[dict]) -> di stats for stats in subblock_stats if all( - [ - stats["args"][key] == corresponding_bf16_args[key] - for key in corresponding_bf16_args.keys() - ] + [stats["args"][key] == corresponding_bf16_args[key] for key in corresponding_bf16_args] ) ] if len(matching_bf16_stats) == 0: diff --git a/modelopt/torch/_compress/tools/__init__.py b/modelopt/torch/puzzletron/tools/__init__.py similarity index 100% rename from modelopt/torch/_compress/tools/__init__.py rename to modelopt/torch/puzzletron/tools/__init__.py diff --git a/modelopt/torch/_compress/tools/bypassed_training/child_init.py b/modelopt/torch/puzzletron/tools/bypassed_training/child_init.py similarity index 95% rename from modelopt/torch/_compress/tools/bypassed_training/child_init.py rename to modelopt/torch/puzzletron/tools/bypassed_training/child_init.py index 1bd36fa09..3981b62e3 100644 --- a/modelopt/torch/_compress/tools/bypassed_training/child_init.py +++ b/modelopt/torch/puzzletron/tools/bypassed_training/child_init.py @@ -22,23 +22,24 @@ import os import re import time +from collections.abc import Callable from copy import deepcopy from enum import Enum from functools import partial from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any import torch from typeguard import check_type -from modelopt.torch._compress.decilm.deci_lm_hf_code.block_config import ( +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import ( SUBBLOCK_CLS_DICT, BlockConfig, _get_dataclass_type, _is_dataclass_type, ) -from modelopt.torch._compress.decilm.deci_lm_hf_code.configuration_decilm import DeciLMConfig -from modelopt.torch._compress.tools.logger import aprint, mprint +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.configuration_decilm import DeciLMConfig +from modelopt.torch.puzzletron.tools.logger import aprint, mprint class GQAInitMode(Enum): @@ -92,16 +93,15 @@ def _process_single_layer( new_config: DeciLMConfig, gqa_init_mode: GQAInitMode, mlp_init_mode: MlpInitMode, - mlp_init_config: Optional[dict[str, Any]], + mlp_init_config: dict[str, Any] | None, linear_init_mode: LinearInitMode, ignored_keys: set, keys: dict, is_original_mha: bool, head_size: int, hidden_size: int, -) -> Tuple[Dict[str, 
torch.Tensor], Dict[str, str]]: - """ - Process a single layer in parallel. Returns (layer_state_dict, keys_to_remove). +) -> tuple[dict[str, torch.Tensor], dict[str, str]]: + """Process a single layer in parallel. Returns (layer_state_dict, keys_to_remove). Thread-safe function for parallel layer processing. """ layer_out_state_dict = {} @@ -119,13 +119,13 @@ def _process_single_layer( o_key = f"{attn_prefix}.o_proj.{part}" attn_keys = [q_key, k_key, v_key, o_key] # Drop attn keys that don't exist and required to be in the new state_dict - attn_keys = [key for key in attn_keys if key in new_state_dict.keys()] + attn_keys = [key for key in attn_keys if key in new_state_dict] if len(attn_keys) > 0 and all(key in keys for key in attn_keys): for key in attn_keys: keys_to_remove[key] = keys[key] if all(key not in ignored_keys for key in attn_keys): is_student_and_teacher_have_same_attention_implementation = all( - key in new_state_dict.keys() for key in attn_keys + key in new_state_dict for key in attn_keys ) if is_student_and_teacher_have_same_attention_implementation: if part == "weight": @@ -168,7 +168,7 @@ def _process_single_layer( else: linear_attn_key = f"{attn_prefix}.linear_attn.weight" - is_student_attn_replaced_with_linear = linear_attn_key in new_state_dict.keys() + is_student_attn_replaced_with_linear = linear_attn_key in new_state_dict if is_student_attn_replaced_with_linear: if linear_init_mode == LinearInitMode.Random: layer_out_state_dict[linear_attn_key] = new_state_dict[linear_attn_key] @@ -180,7 +180,7 @@ def _process_single_layer( raise ValueError(f"Unknown {linear_init_mode=}") else: # student attn random init - for new_key in new_state_dict.keys(): + for new_key in new_state_dict: if attn_prefix in new_key: layer_out_state_dict[new_key] = new_state_dict[new_key] @@ -190,7 +190,7 @@ def _process_single_layer( mlp_prefix = f"model.layers.{layer_idx}.mlp" linear_mlp_key = f"{mlp_prefix}.linear_mlp.weight" - is_student_mlp_replaced_with_linear = linear_mlp_key in new_state_dict.keys() + is_student_mlp_replaced_with_linear = linear_mlp_key in new_state_dict if is_student_mlp_replaced_with_linear: if linear_init_mode == LinearInitMode.Random: layer_out_state_dict[linear_mlp_key] = new_state_dict[linear_mlp_key] @@ -312,7 +312,7 @@ def _process_single_layer( ]: key_possibly_missing_in_student = f".{layer_idx}.{key_possibly_missing_in_student}" is_key_missing_from_student = ( - len([k for k in new_state_dict.keys() if key_possibly_missing_in_student in k]) == 0 + len([k for k in new_state_dict if key_possibly_missing_in_student in k]) == 0 ) if is_key_missing_from_student: for k in list(keys.keys()): @@ -331,12 +331,12 @@ def create_child_state_dict( gqa_init_mode: GQAInitMode, ignore_fn: IgnoreFn = default_ignore_fn, mlp_init_mode: MlpInitMode = MlpInitMode.CopyAsIs, - mlp_init_config: Optional[dict[str, Any]] = None, - owned_block_indexes: Optional[set[int]] = None, + mlp_init_config: dict[str, Any] | None = None, + owned_block_indexes: set[int] | None = None, linear_init_mode: LinearInitMode = LinearInitMode.Random, hidden_size_init_mode: HiddenSizeInitMode = HiddenSizeInitMode.CopyAsIs, - channel_importance_path: Optional[str] = None, - max_layer_workers: Optional[int] = None, # Now optional - will auto-calculate if None + channel_importance_path: str | None = None, + max_layer_workers: int | None = None, # Now optional - will auto-calculate if None ): mprint("=== Starting create_child_state_dict with optimizations ===") total_start_time = time.time() @@ -391,14 +391,14 @@ 
def create_child_state_dict( hidden_size = original_config.hidden_size - ignored_keys = set([key for key in original_state_dict.keys() if ignore_fn(key)]) + ignored_keys = set([key for key in original_state_dict if ignore_fn(key)]) for key in ignored_keys: aprint(f"Ignoring key {key} and taking its init from new_state_dict") out_state_dict[key] = new_state_dict[key] keys = { match.group(1) if (match := re.search(r"(h\.\d+\..*)", key)) is not None else key: key - for key in original_state_dict.keys() + for key in original_state_dict } setup_time = time.time() - setup_start_time mprint(f"Phase 1 - Setup and memory pre-allocation: {setup_time:.2f}s") @@ -527,7 +527,7 @@ def _generate_moe_keys(layer_idx: int, num_experts: int) -> tuple[str, dict[str, def _concatenate_experts_into_dense_ffn( original_state_dict: dict[str, torch.Tensor], - mlp_init_config: Optional[dict], + mlp_init_config: dict | None, hidden_size: int, layer_idx: int, child_block_config: BlockConfig, @@ -585,8 +585,7 @@ def _concatenate_experts_into_dense_ffn( "concat_dims and experts_weights must have the same keys" ) concat_routed_state_dict = { - name: torch.cat(experts_weights[name], dim=concat_dims[name]) - for name in concat_dims.keys() + name: torch.cat(experts_weights[name], dim=concat_dims[name]) for name in concat_dims } # turn the shared expert into a normal FFN. concatenate the pruned routed experts if needed. @@ -646,16 +645,16 @@ def _verify_state_dicts_match( def _init_mlp( *, - mlp_init_mode: Union[MlpInitMode, str], + mlp_init_mode: MlpInitMode | str, layer_idx: int, original_config: DeciLMConfig, - mlp_init_config: Optional[dict[str, Any]], + mlp_init_config: dict[str, Any] | None, original_state_dict: dict, new_state_dict: dict, new_config: DeciLMConfig, keys: dict[str, str], ignored_keys: set[str], - expert_idx: Optional[int] = None, + expert_idx: int | None = None, ) -> dict[str, torch.Tensor]: out_state_dict = {} @@ -680,7 +679,7 @@ def _init_mlp( projection_matrix = None for mlp_key in mlp_keys: expanded_dim = 1 if "down_proj" in mlp_key else 0 - if mlp_key in new_state_dict.keys(): + if mlp_key in new_state_dict: mlp_module_weight, pruned_filters, projection_matrix = _init_mlp_module( mlp_init_mode, expanded_dim, @@ -700,17 +699,17 @@ def _init_mlp( def _init_mlp_module( - mlp_init_mode: Union[MlpInitMode, str], + mlp_init_mode: MlpInitMode | str, expanded_dim: int, new_item: torch.Tensor, new_config: DeciLMConfig, orig_item: torch.Tensor, original_config: DeciLMConfig, - mlp_init_config: Optional[dict[str, Any]], - pruned_filters: Optional[torch.Tensor] = None, - projection_matrix: Optional[dict[str, torch.Tensor]] = None, - mlp_prefix: Optional[str] = None, -) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[dict[str, torch.Tensor]]]: + mlp_init_config: dict[str, Any] | None, + pruned_filters: torch.Tensor | None = None, + projection_matrix: dict[str, torch.Tensor] | None = None, + mlp_prefix: str | None = None, +) -> tuple[torch.Tensor, torch.Tensor | None, dict[str, torch.Tensor] | None]: if isinstance(mlp_init_mode, str): mlp_init_mode = MlpInitMode(mlp_init_mode) assert orig_item.ndim == 2, f"{orig_item.ndim=}" @@ -779,14 +778,14 @@ def _init_mlp_module( def _init_moe_module( *, - mlp_init_mode: Union[MlpInitMode, str], - mlp_init_config: Optional[dict[str, Any]], + mlp_init_mode: MlpInitMode | str, + mlp_init_config: dict[str, Any] | None, layer_idx: int, orig_router_weight: torch.Tensor, orig_experts_weights: dict[str, list[torch.Tensor]], new_router_weight: torch.Tensor, 
new_experts_weights: dict[str, list[torch.Tensor]], -) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[dict[str, torch.Tensor]]]: +) -> tuple[torch.Tensor, torch.Tensor | None, dict[str, torch.Tensor] | None]: if isinstance(mlp_init_mode, str): mlp_init_mode = MlpInitMode(mlp_init_mode) @@ -849,11 +848,11 @@ def _prune_experts_by_score( return result_router_weight, result_experts_weights -def _load_expert_scores(mlp_init_config: Optional[dict[str, Any]]) -> list[list[int | float]]: +def _load_expert_scores(mlp_init_config: dict[str, Any] | None) -> list[list[int | float]]: assert mlp_init_config is not None if "expert_scores_file" in mlp_init_config: expert_scores_file = mlp_init_config["expert_scores_file"] - with open(expert_scores_file, "r") as f: + with open(expert_scores_file) as f: expert_scores = json.load(f) elif "activations_log_dir" in mlp_init_config: _cache_activations_log(mlp_init_config) @@ -1111,7 +1110,7 @@ def _init_attention_biases( bias_sd["k"] = bias_sd["k"][:, 0] bias_sd["v"] = bias_sd["v"][:, 0] elif gqa_init_mode == GQAInitMode.CopyAsIs: - for key in bias_sd.keys(): + for key in bias_sd: assert new_bias_sd[key].shape == bias_sd[key].shape, ( f"({new_bias_sd[key].shape=}) != ({bias_sd[key].shape=})" ) @@ -1227,8 +1226,7 @@ def _init_linear_attn( v_key: str, o_key: str, ) -> torch.Tensor: - """ - Init a linear layer that operates like an attention layer that assigns score 1 to the current token + """Init a linear layer that operates like an attention layer that assigns score 1 to the current token and score 0 to all others: out = (Wo @ Wv) @ x """ n_embd = parent_config.hidden_size @@ -1247,9 +1245,7 @@ def _init_linear_attn( def _init_linear_mlp(teacher_mlp_state_dict: dict[str, torch.Tensor]) -> torch.Tensor: - """ - A linear layer that does (W_down @ W_up) @ x, ignoring W_gate. - """ + """A linear layer that does (W_down @ W_up) @ x, ignoring W_gate.""" if "linear_mlp.weight" in teacher_mlp_state_dict: # if the teacher itself is a linear layer return teacher_mlp_state_dict["linear_mlp.weight"] @@ -1318,8 +1314,7 @@ def _parse_model_config_overrides( model_config_overrides_json: str | dict | Path | list[dict], n_layer: int, ) -> list[dict[str, Any]]: - """ - example model_config_overrides_json: + """Example model_config_overrides_json: { "attention": [{"n_heads_in_group": 2}], "ffn": [{"intermediate_size": 14336}] @@ -1368,11 +1363,10 @@ def _apply_hidden_size_pruning( new_config: DeciLMConfig, original_config: DeciLMConfig, hidden_size_init_mode: HiddenSizeInitMode, - channel_importance_path: Optional[str] = None, - owned_block_indexes: Optional[list[int]] = None, + channel_importance_path: str | None = None, + owned_block_indexes: list[int] | None = None, ) -> dict[str, torch.Tensor]: - """ - Apply hidden size pruning to all layers that depend on hidden_size. + """Apply hidden size pruning to all layers that depend on hidden_size. This includes embeddings, layer norms, and any linear layers that haven't been handled yet. 
""" if isinstance(hidden_size_init_mode, str): @@ -1387,7 +1381,7 @@ def _apply_hidden_size_pruning( # Load channel ranking if needed if hidden_size_init_mode == HiddenSizeInitMode.PruneByChannelRanking: if channel_importance_path is not None: - with open(channel_importance_path, "r") as f: + with open(channel_importance_path) as f: channel_ranking = json.load(f)["channel_importance_ranking"] else: raise ValueError( @@ -1580,12 +1574,10 @@ def _prune_hidden_size_dimension( original_tensor: torch.Tensor, new_hidden_size: int, hidden_size_init_mode: HiddenSizeInitMode, - channel_ranking: Optional[list[int]] = None, + channel_ranking: list[int] | None = None, dim: int = -1, ) -> torch.Tensor: - """ - Prune a tensor along the specified dimension to match the new hidden size. - """ + """Prune a tensor along the specified dimension to match the new hidden size.""" original_size = original_tensor.shape[dim] if hidden_size_init_mode == HiddenSizeInitMode.Random: diff --git a/modelopt/torch/_compress/tools/bypassed_training/init_child_from_parent.py b/modelopt/torch/puzzletron/tools/bypassed_training/init_child_from_parent.py similarity index 87% rename from modelopt/torch/_compress/tools/bypassed_training/init_child_from_parent.py rename to modelopt/torch/puzzletron/tools/bypassed_training/init_child_from_parent.py index f06db92fb..46e403c5f 100644 --- a/modelopt/torch/_compress/tools/bypassed_training/init_child_from_parent.py +++ b/modelopt/torch/puzzletron/tools/bypassed_training/init_child_from_parent.py @@ -16,16 +16,14 @@ """TODO Add description""" -import argparse import json import time -from typing import Optional import torch import yaml -from modelopt.torch._compress.decilm.deci_lm_hf_code.modeling_decilm import DeciLMForCausalLM -from modelopt.torch._compress.tools.bypassed_training.child_init import ( +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.modeling_decilm import DeciLMForCausalLM +from modelopt.torch.puzzletron.tools.bypassed_training.child_init import ( GQAInitMode, HiddenSizeInitMode, LinearInitMode, @@ -33,16 +31,16 @@ create_child_state_dict, update_model_config, ) -from modelopt.torch._compress.tools.checkpoint_utils import ( +from modelopt.torch.puzzletron.tools.checkpoint_utils import ( copy_tokenizer, load_model_config, load_state_dict, ) -from modelopt.torch._compress.tools.checkpoint_utils_hf import ( +from modelopt.torch.puzzletron.tools.checkpoint_utils_hf import ( _save_checkpoint, copy_deci_lm_hf_code, ) -from modelopt.torch._compress.tools.logger import mprint +from modelopt.torch.puzzletron.tools.logger import mprint """ @@ -87,7 +85,7 @@ echo "MODEL_CONFIG_OVERRIDES_JSON:" echo "${MODEL_CONFIG_OVERRIDES_JSON}" -python -m modelopt.torch._compress.tools.bypassed_training.init_child_from_parent \ +python -m modelopt.torch.puzzletron.tools.bypassed_training.init_child_from_parent \ --parent_checkpoint_dir="$PARENT_DIR" \ --model_config_overrides_json="$MODEL_CONFIG_OVERRIDES_JSON" \ --output_checkpoint_dir="$OUTPUT_DIR" \ @@ -102,15 +100,14 @@ def init_child_from_parent( output_checkpoint_dir: str, gqa_init_mode: GQAInitMode, mlp_init_mode: MlpInitMode, - mlp_init_config_yaml: Optional[str], + mlp_init_config_yaml: str | None, linear_init_mode: LinearInitMode, - hidden_size_init_mode: Optional[HiddenSizeInitMode] = None, - channel_importance_path: Optional[str] = None, - max_workers: Optional[int] = None, # Auto-calculate optimal workers if None - max_layer_workers: Optional[int] = None, # Auto-calculate optimal workers if None + hidden_size_init_mode: 
HiddenSizeInitMode | None = None,
+    channel_importance_path: str | None = None,
+    max_workers: int | None = None,  # Auto-calculate optimal workers if None
+    max_layer_workers: int | None = None,  # Auto-calculate optimal workers if None
 ) -> None:
-    """
-    Init child models from parent models in the style of bypass training,
+    """Init child models from parent models in the style of bypass training,
     but without having to run the entire bypass pipeline.

     I/O Optimization Parameters:
@@ -210,7 +207,7 @@ def init_child_from_parent(
     total_core_time = create_child_state_dict_time + save_checkpoint_time
     actual_layer_workers = max_layer_workers if max_layer_workers else "auto"
     actual_io_workers = max_workers if max_workers else "auto"
-    mprint(f"\n=== PROFILING SUMMARY ===")
+    mprint("\n=== PROFILING SUMMARY ===")
     mprint(
         f"create_child_state_dict: {create_child_state_dict_time:.2f}s ({create_child_state_dict_time / total_core_time * 100:.1f}%)"
     )
@@ -219,4 +216,4 @@
     )
     mprint(f"Total core processing: {total_core_time:.2f}s")
     mprint(f"Optimizations: I/O workers={actual_io_workers}, Layer workers={actual_layer_workers}")
-    mprint(f"=========================\n")
+    mprint("=========================\n")
diff --git a/modelopt/torch/_compress/tools/checkpoint_utils.py b/modelopt/torch/puzzletron/tools/checkpoint_utils.py
similarity index 92%
rename from modelopt/torch/_compress/tools/checkpoint_utils.py
rename to modelopt/torch/puzzletron/tools/checkpoint_utils.py
index 43d3c4364..f08b89e44 100644
--- a/modelopt/torch/_compress/tools/checkpoint_utils.py
+++ b/modelopt/torch/puzzletron/tools/checkpoint_utils.py
@@ -14,8 +14,7 @@
 # limitations under the License.
 # mypy: ignore-errors

-"""
-It provides general utilities for loading and initializing PyTorch model checkpoints,
+"""Provides general utilities for loading and initializing PyTorch model checkpoints,
 particularly for DeciLM models.
 """
@@ -31,8 +30,8 @@
 from transformers import AutoTokenizer
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME

-from modelopt.torch._compress.tools.checkpoint_utils_hf import load_model_config
-from modelopt.torch._compress.tools.common import infer_weights_dtype
+from modelopt.torch.puzzletron.tools.checkpoint_utils_hf import load_model_config
+from modelopt.torch.puzzletron.tools.common import infer_weights_dtype

 SAFETENSORS_SUBBLOCKS_DIR_NAME = "subblocks_safetensors"
 PTH_SUBBLOCKS_DIR_NAME = "subblocks"
@@ -56,7 +55,7 @@ def load_state_dict(checkpoint_dir: Path | str) -> dict[str, torch.Tensor]:
     if (checkpoint_dir / SAFE_WEIGHTS_INDEX_NAME).exists() or (
         checkpoint_dir / SAFE_WEIGHTS_NAME
     ).exists():
-        from modelopt.torch._compress.tools.sharded_checkpoint_utils import (
+        from modelopt.torch.puzzletron.tools.sharded_checkpoint_utils import (
             load_sharded_state_dict,  # local import to avoid circular import
         )
@@ -124,9 +123,7 @@ def init_empty_module(


 def skip_init(module_cls, *args, **kwargs) -> nn.Module:
-    """
-    Heavily inspired by torch.nn.utils.skip_init but does not require the module to accept a "device" kwarg.
- """ + """Heavily inspired by torch.nn.utils.skip_init but does not require the module to accept a "device" kwarg.""" if not issubclass(module_cls, torch.nn.Module): raise RuntimeError(f"Expected a Module; got {module_cls}") @@ -165,8 +162,7 @@ def copy_tokenizer( target_dir: Path | str, on_failure: Literal["raise", "warn"] = "raise", ) -> None: - """ - Prefer loading the tokenizer from huggingface hub (when tokenizer_name.txt file is available) + """Prefer loading the tokenizer from huggingface hub (when tokenizer_name.txt file is available) to avoid collision between transformers versions. """ source_tokenizer_name_path = Path(source_dir_or_tokenizer_name) / "tokenizer_name.txt" diff --git a/modelopt/torch/_compress/tools/checkpoint_utils_hf.py b/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py similarity index 94% rename from modelopt/torch/_compress/tools/checkpoint_utils_hf.py rename to modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py index 3c73498d5..f52c12d26 100644 --- a/modelopt/torch/_compress/tools/checkpoint_utils_hf.py +++ b/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py @@ -14,8 +14,7 @@ # limitations under the License. # mypy: ignore-errors -""" -Provides utilities for loading and saving PyTorch model checkpoints in the Hugging Face format, +"""Provides utilities for loading and saving PyTorch model checkpoints in the Hugging Face format, particularly for DeciLM models. """ @@ -34,13 +33,13 @@ from safetensors.torch import save_file as safe_save_file from transformers.utils import SAFE_WEIGHTS_INDEX_NAME -from modelopt.torch._compress.decilm import deci_lm_hf_code -from modelopt.torch._compress.decilm.deci_lm_hf_code.configuration_decilm import DeciLMConfig -from modelopt.torch._compress.decilm.deci_lm_hf_code.modeling_decilm import DeciLMForCausalLM -from modelopt.torch._compress.tools.common import infer_weights_dtype -from modelopt.torch._compress.tools.logger import mprint -from modelopt.torch._compress.tools.post_init_sparse import SparsityMethod -from modelopt.torch._compress.tools.robust_json import json_dumps +from modelopt.torch.puzzletron.decilm import deci_lm_hf_code +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.configuration_decilm import DeciLMConfig +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.modeling_decilm import DeciLMForCausalLM +from modelopt.torch.puzzletron.tools.common import infer_weights_dtype +from modelopt.torch.puzzletron.tools.logger import mprint +from modelopt.torch.puzzletron.tools.post_init_sparse import SparsityMethod +from modelopt.torch.puzzletron.tools.robust_json import json_dumps SAFETENSORS_SUBBLOCKS_DIR_NAME = "subblocks_safetensors" PTH_SUBBLOCKS_DIR_NAME = "subblocks" @@ -70,11 +69,10 @@ def load_checkpoint( model_config_overrides: dict | None = None, ignore_unexpected_config_keys: bool = False, ) -> DeciLMForCausalLM: - """ - Unlike AutoModelForCausalLM.from_pretrained, the models loaded by this function use your + """Unlike AutoModelForCausalLM.from_pretrained, the models loaded by this function use your local repo code, not the code inside the checkpoint. 
""" - from modelopt.torch._compress.tools.checkpoint_utils import ( + from modelopt.torch.puzzletron.tools.checkpoint_utils import ( load_state_dict, # prevent circular import ) @@ -193,7 +191,7 @@ def _save_checkpoint( def split_checkpoint_to_subblocks(checkpoint_dir: Path | str) -> None: - from modelopt.torch._compress.tools.checkpoint_utils import ( + from modelopt.torch.puzzletron.tools.checkpoint_utils import ( load_state_dict, # prevent circular import ) @@ -374,8 +372,7 @@ def _write_file_process_safe( path: Path | str, write_fn: Callable[[Any, BinaryIO], None] = _write_text, ) -> None: - """ - Write a file in a multi-process safe way. + """Write a file in a multi-process safe way. If another process tries to write the same file using this method, the current process "gives up" and assumes that the matter is being taken care of by another process. @@ -444,9 +441,7 @@ def save_model_config(model_config: DeciLMConfig, checkpoint_dir: Path | str) -> def copy_deci_lm_hf_code(output_dir: Path | str) -> None: - """ - Copy the deci_lm_hf_code directory to the output directory. - """ + """Copy the deci_lm_hf_code directory to the output directory.""" output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) code_dir = Path(deci_lm_hf_code.__file__).parent diff --git a/modelopt/torch/_compress/tools/common.py b/modelopt/torch/puzzletron/tools/common.py similarity index 100% rename from modelopt/torch/_compress/tools/common.py rename to modelopt/torch/puzzletron/tools/common.py diff --git a/modelopt/torch/_compress/tools/hydra_utils.py b/modelopt/torch/puzzletron/tools/hydra_utils.py similarity index 100% rename from modelopt/torch/_compress/tools/hydra_utils.py rename to modelopt/torch/puzzletron/tools/hydra_utils.py diff --git a/modelopt/torch/_compress/tools/kd_model.py b/modelopt/torch/puzzletron/tools/kd_model.py similarity index 100% rename from modelopt/torch/_compress/tools/kd_model.py rename to modelopt/torch/puzzletron/tools/kd_model.py diff --git a/modelopt/torch/_compress/tools/logger.py b/modelopt/torch/puzzletron/tools/logger.py similarity index 92% rename from modelopt/torch/_compress/tools/logger.py rename to modelopt/torch/puzzletron/tools/logger.py index 3e8e213ca..e4b87e377 100644 --- a/modelopt/torch/_compress/tools/logger.py +++ b/modelopt/torch/puzzletron/tools/logger.py @@ -48,13 +48,15 @@ def __init__(self, name, level=logging.DEBUG): self.world_size = int(os.environ.get("WORLD_SIZE", 1)) def dist_log(self, msg: str, ranks: str = "main"): - """ - Log parameter msg with the given ranks. - parameter ranks: - "all": log with all ranks - "main": log with only rank 0 in node 0 - "last": log with only rank -1 in node 0 - "local_main": log with only rank 0 in all nodes + """Log parameter msg with the given ranks. + + Args: + msg: The message to log. + ranks: The ranks to log the message to. 
Choices are:
+                "all": log with all ranks
+                "main": log with only rank 0 in node 0
+                "last": log with only rank -1 in node 0
+                "local_main": log with only rank 0 in all nodes
         """
         # print(msg, ranks)
         if ranks not in ["all", "main", "local_main", "last"]:
diff --git a/modelopt/torch/_compress/tools/post_init_sparse.py b/modelopt/torch/puzzletron/tools/post_init_sparse.py
similarity index 94%
rename from modelopt/torch/_compress/tools/post_init_sparse.py
rename to modelopt/torch/puzzletron/tools/post_init_sparse.py
index 824d0856c..e2c45c403 100644
--- a/modelopt/torch/_compress/tools/post_init_sparse.py
+++ b/modelopt/torch/puzzletron/tools/post_init_sparse.py
@@ -17,7 +17,7 @@
 from torch import nn
 from torch.nn.utils.prune import custom_from_mask

-from modelopt.torch._compress.decilm.deci_lm_hf_code.modeling_decilm import DeciLMForCausalLM
+from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.modeling_decilm import DeciLMForCausalLM

 """
 Converts a state dictionary from PyTorch's pruning format (with _orig and _mask suffixes)
@@ -27,9 +27,7 @@

 class SparsityMethod:
     def calculate_masks(self, state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
-        """
-        gets a model state_dict, returns a state_dict-like mask_dict with masks
-        """
+        """Gets a model state_dict and returns a state_dict-like mask_dict with masks."""

     @staticmethod
     def fix_state_dict_inplace(state_dict, verbose=False, change_dtype=False):
@@ -99,9 +97,7 @@ def do_sparsity(self, model: DeciLMForCausalLM, mask_dict=None):

 class SparsityMethod2o4(SparsityMethod):
     def calculate_masks(self, state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
-        """
-        gets a model state_dict, returns a state_dict-like mask_dict with masks
-        """
+        """Gets a model state_dict and returns a state_dict-like mask_dict with masks."""
         mask_dict = {}
         for key, val in state_dict.items():
             orig_size = val.shape
diff --git a/modelopt/torch/_compress/tools/robust_json.py b/modelopt/torch/puzzletron/tools/robust_json.py
similarity index 100%
rename from modelopt/torch/_compress/tools/robust_json.py
rename to modelopt/torch/puzzletron/tools/robust_json.py
diff --git a/modelopt/torch/_compress/tools/sharded_checkpoint_utils.py b/modelopt/torch/puzzletron/tools/sharded_checkpoint_utils.py
similarity index 94%
rename from modelopt/torch/_compress/tools/sharded_checkpoint_utils.py
rename to modelopt/torch/puzzletron/tools/sharded_checkpoint_utils.py
index 7a247bbdf..1cb5e8489 100644
--- a/modelopt/torch/_compress/tools/sharded_checkpoint_utils.py
+++ b/modelopt/torch/puzzletron/tools/sharded_checkpoint_utils.py
@@ -14,8 +14,7 @@
 # limitations under the License.
 # mypy: ignore-errors

-"""
-Provides utilities for distributed loading, saving, and manipulation of
+"""Provides utilities for distributed loading, saving, and manipulation of
""" @@ -28,25 +27,23 @@ import torch import torch.distributed import torch.nn as nn -from huggingface_hub import split_torch_state_dict_into_shards from safetensors import safe_open from safetensors.torch import load_file as safe_load_file from safetensors.torch import save_file as safe_save_file -from tqdm import tqdm from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME from transformers.utils.hub import cached_file, get_checkpoint_shard_files from typing_extensions import override import modelopt.torch.utils.distributed as dist -from modelopt.torch._compress.decilm.deci_lm_hf_code.configuration_decilm import DeciLMConfig -from modelopt.torch._compress.decilm.deci_lm_hf_code.modeling_decilm import ( +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.configuration_decilm import DeciLMConfig +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.modeling_decilm import ( DeciLMDecoderLayer, DeciLMForCausalLM, rope_type_to_class, ) -from modelopt.torch._compress.tools.checkpoint_utils import load_model_config, load_state_dict -from modelopt.torch._compress.tools.logger import mprint -from modelopt.torch._compress.utils.utils import EmptyInitOnDevice +from modelopt.torch.puzzletron.tools.checkpoint_utils import load_model_config, load_state_dict +from modelopt.torch.puzzletron.tools.logger import mprint +from modelopt.torch.puzzletron.utils.utils import EmptyInitOnDevice class DummyModule(nn.Module): @@ -243,7 +240,7 @@ def create_sharded_model( def load_state_dict_to_shards( model_shard: torch.nn.Module, loaded_state_dict: dict | None = None ) -> None: - from modelopt.torch._compress.sewing_kit.utils import ( + from modelopt.torch.puzzletron.sewing_kit.utils import ( distributed_isend_obj, distributed_recv_obj, ) @@ -291,9 +288,7 @@ def load_state_dict_to_shards( def save_sharded_model( model_shard: torch.nn.Module | dict[str, torch.Tensor], out_path: str | Path ): - """ - out_path is usually output_checkpoint_path / "model.safetensors" - """ + """out_path is usually output_checkpoint_path / "model.safetensors" """ dist.barrier() if isinstance(model_shard, torch.nn.Module): @@ -351,9 +346,7 @@ def load_sharded_state_dict( keys_to_load: Iterable[str] | None = None, device: torch.device | str = "cpu", ) -> dict[str, torch.Tensor]: - """ - keys_to_load: entire state_dict if None, else partial state_dict containing only these keys - """ + """keys_to_load: entire state_dict if None, else partial state_dict containing only these keys""" shard_paths = _resolve_shard_paths(model_name_or_path) # print(f"shard_paths: {shard_paths}") partial_state_dict = {} diff --git a/modelopt/torch/_compress/tools/validate_model.py b/modelopt/torch/puzzletron/tools/validate_model.py similarity index 73% rename from modelopt/torch/_compress/tools/validate_model.py rename to modelopt/torch/puzzletron/tools/validate_model.py index 456f9fab8..6c3dc3640 100644 --- a/modelopt/torch/_compress/tools/validate_model.py +++ b/modelopt/torch/puzzletron/tools/validate_model.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -Provides a function to validate a model. Runs a model forward pass on a dataset and calculates +"""Provides a function to validate a model. Runs a model forward pass on a dataset and calculates the loss, and optionally registers hooks to capture the inputs and the outputs of pytorch modules that are used for activation scoring for pruning. 
@@ -36,19 +35,19 @@ ) import modelopt.torch.utils.distributed as dist -from modelopt.torch._compress.activation_scoring.activation_hooks.utils import ( +from modelopt.torch.puzzletron.activation_scoring.activation_hooks.utils import ( register_activation_hooks, ) -from modelopt.torch._compress.tools.checkpoint_utils_hf import load_checkpoint -from modelopt.torch._compress.tools.logger import aprint, mprint -from modelopt.torch._compress.tools.sharded_checkpoint_utils import load_and_shard_model -from modelopt.torch._compress.utils.data.dataloaders import create_validation_dataloader -from modelopt.torch._compress.utils.parsing import simple_parse_args_string -from modelopt.torch._compress.utils.validate_runtime_pipeline import ( +from modelopt.torch.puzzletron.tools.checkpoint_utils_hf import load_checkpoint +from modelopt.torch.puzzletron.tools.logger import aprint, mprint +from modelopt.torch.puzzletron.tools.sharded_checkpoint_utils import load_and_shard_model +from modelopt.torch.puzzletron.utils.data.dataloaders import create_validation_dataloader +from modelopt.torch.puzzletron.utils.parsing import simple_parse_args_string +from modelopt.torch.puzzletron.utils.validate_runtime_pipeline import ( HiddenStatesAndLMHead, calculate_losses_pipeline, ) -from modelopt.torch._compress.utils.validation import calculate_losses +from modelopt.torch.puzzletron.utils.validation import calculate_losses """ Two goals: @@ -80,40 +79,45 @@ def validate_model( Args: args: Configuration object containing the following attributes: - Model Configuration: - - model_name_or_path (str): Path to model checkpoint or HuggingFace model name. - Required unless model is passed directly. - - model_dtype (str or torch.dtype): Model data type (e.g., "torch.bfloat16", torch.float16). - - autocast_dtype (str or torch.dtype): Autocast data type for mixed precision. - - Dataset Configuration: - - dataset_path (str): Path to the validation dataset. - - tokenizer_name (str, optional): Tokenizer name/path. Uses model_name_or_path if not specified. - - data_column (str): Column name in dataset containing text data. - - block_size (int): Maximum sequence length for tokenization. - - eval_samples (int, optional): Number of samples to evaluate. Uses all if None. - - val_dataset_name (str): Name of validation dataset split. - - source_datasets_to_discard (list[str], optional): List of source datasets to exclude. - - load_dataset_fn (callable, optional): Custom function to load the dataset. - - Data Processing: - - micro_batch_size (int): Batch size for evaluation. - - seed (int): Random seed for reproducibility. - - shuffle_seed (int, optional): Seed for shuffling data. Uses seed if None. - - varlen (bool): Enable variable-length sequences. - - bos_rate (float): Rate of adding BOS token. - - fim_rate (float): Fill-in-the-middle rate for code completion tasks. - - fim_spm_rate (float): SPM-based fill-in-the-middle rate. - - Activation Hooks: - - activations_log_dir (str, optional): Directory to log activation scores. If provided, - hooks will be registered to capture activations. - - activation_hooks_kwargs (str or dict, optional): Arguments for activation hooks. - If string, comma-separated format: "arg1=val1,arg2=val2". - - Execution Options: - - calc_losses_on_cpu (bool): Calculate losses on CPU to avoid OOM. Very slow, not recommended. - - write_results (bool): Write validation results to file. + Model Configuration attributes: + + - ``model_name_or_path`` (str): Path to model checkpoint or HuggingFace model name. 
+ Required unless model is passed directly. + - ``model_dtype`` (str or torch.dtype): Model data type (e.g., "torch.bfloat16", torch.float16). + - ``autocast_dtype`` (str or torch.dtype): Autocast data type for mixed precision. + + Dataset Configuration attributes: + + - ``dataset_path`` (str): Path to the validation dataset. + - ``tokenizer_name`` (str, optional): Tokenizer name/path. Uses model_name_or_path if not specified. + - ``data_column`` (str): Column name in dataset containing text data. + - ``block_size`` (int): Maximum sequence length for tokenization. + - ``eval_samples`` (int, optional): Number of samples to evaluate. Uses all if None. + - ``val_dataset_name`` (str): Name of validation dataset split. + - ``source_datasets_to_discard`` (list[str], optional): List of source datasets to exclude. + - ``load_dataset_fn`` (callable, optional): Custom function to load the dataset. + + Data Processing attributes: + + - ``micro_batch_size`` (int): Batch size for evaluation. + - ``seed`` (int): Random seed for reproducibility. + - ``shuffle_seed`` (int, optional): Seed for shuffling data. Uses seed if None. + - ``varlen`` (bool): Enable variable-length sequences. + - ``bos_rate`` (float): Rate of adding BOS token. + - ``fim_rate`` (float): Fill-in-the-middle rate for code completion tasks. + - ``fim_spm_rate`` (float): SPM-based fill-in-the-middle rate. + + Activation Hooks attributes: + + - ``activations_log_dir`` (str, optional): Directory to log activation scores. + If provided, hooks will be registered to capture activations. + - ``activation_hooks_kwargs`` (str or dict, optional): Arguments for activation hooks. + If string, comma-separated format: "arg1=val1,arg2=val2". + + Execution Options attributes: + + - ``calc_losses_on_cpu`` (bool): Calculate losses on CPU to avoid OOM. Very slow, not recommended. + - ``write_results`` (bool): Write validation results to file. model: Pre-loaded model. If None, will be loaded from args.model_name_or_path. tokenizer: Pre-loaded tokenizer. If None, will be loaded based on args. @@ -121,16 +125,17 @@ def validate_model( return_hidden_states: Whether to return hidden states from the model. pipeline_parallel: Enable pipeline parallelism for large models. calculate_full_score_ablations: Calculate comprehensive teacher similarity scores. - False calculates only a small suite for efficiency. + False calculates only a small suite for efficiency. val_dataloader: Pre-created validation dataloader. If None, will be created from args. Returns: A tuple containing: + - losses: Dictionary mapping loss names to loss statistics (avg, per_sample). - hidden_states_per_batch: Hidden states and LM head outputs if return_hidden_states is True, else None. + Returns (None, None) if not on master rank. 
""" - if val_dataloader is None: val_dataloader = prepare_dataloader(args, tokenizer) if dist.is_master() else None validation_full_iters = ( @@ -157,7 +162,7 @@ def validate_model( ) # Create checkpoint manager with hooks - from modelopt.torch._compress.utils.checkpoint_manager import ScoringCheckpointManager + from modelopt.torch.puzzletron.utils.checkpoint_manager import ScoringCheckpointManager mprint( f"Creating checkpoint manager with {len(activation_hooks)} hooks for dir: {args.activations_log_dir}" diff --git a/modelopt/torch/_compress/tools/validate_puzzle_with_multi_replacements.py b/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py similarity index 70% rename from modelopt/torch/_compress/tools/validate_puzzle_with_multi_replacements.py rename to modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py index 6bc4d11b3..4e3266df4 100644 --- a/modelopt/torch/_compress/tools/validate_puzzle_with_multi_replacements.py +++ b/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py @@ -21,11 +21,9 @@ # mypy: ignore-errors import json -import shutil import warnings from functools import partial from pathlib import Path -from typing import Optional import torch from omegaconf import DictConfig @@ -33,25 +31,25 @@ from transformers import AutoTokenizer, PreTrainedTokenizerBase import modelopt.torch.utils.distributed as dist -from modelopt.torch._compress.decilm.deci_lm_hf_code.configuration_decilm import DeciLMConfig -from modelopt.torch._compress.replacement_library.replacement_library import ReplacementLibrary -from modelopt.torch._compress.replacement_library.replacement_utils import parse_layer_replacement -from modelopt.torch._compress.tools import validate_model -from modelopt.torch._compress.tools.checkpoint_utils import ( +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.configuration_decilm import DeciLMConfig +from modelopt.torch.puzzletron.replacement_library.replacement_library import ReplacementLibrary +from modelopt.torch.puzzletron.replacement_library.replacement_utils import parse_layer_replacement +from modelopt.torch.puzzletron.tools import validate_model +from modelopt.torch.puzzletron.tools.checkpoint_utils import ( SAFETENSORS_SUBBLOCKS_DIR_NAME, copy_tokenizer, ) -from modelopt.torch._compress.tools.checkpoint_utils_hf import ( +from modelopt.torch.puzzletron.tools.checkpoint_utils_hf import ( copy_deci_lm_hf_code, save_checkpoint, save_safetensors_index, ) -from modelopt.torch._compress.tools.validation_utils import ( +from modelopt.torch.puzzletron.tools.validation_utils import ( validate_model_and_extract_hidden_states, validate_model_with_teacher_similarity_metrics, ) -from modelopt.torch._compress.utils.parsing import get_nested_key, parse_path -from modelopt.torch._compress.utils.validate_runtime_pipeline import perform_pipeline_stitches +from modelopt.torch.puzzletron.utils.parsing import get_nested_key +from modelopt.torch.puzzletron.utils.validate_runtime_pipeline import perform_pipeline_stitches """ Usage Example: @@ -70,51 +68,58 @@ def validate_puzzle_solutions(args: DictConfig) -> None: Args: args: Configuration object containing the following attributes: - Puzzle Configuration (Required): - - replacement_library_path (Path): Path to the replacement library JSON file. - - solutions_path (Path): Path to puzzle solutions JSON file or directory containing solution files. - - solutions_to_validate (list[int], optional): Indices of specific solutions to validate. 
- Validates all solutions if None. - - sort_solutions_by (str, optional): JSON field path to sort solutions by before validation. - - bigger_is_better (bool): If True, sort solutions in descending order. Used with sort_solutions_by. - - skip_validation (bool): If True, skip model validation and only save models if requested. - - save_models (bool): If True, save realized model checkpoints for each solution. - - Teacher/Tokenizer Configuration: - - teacher_dir (Path, optional): Path to teacher model directory. Auto-inferred if not provided. - - tokenizer_name (str, optional): Tokenizer name/path. Uses teacher_dir if not specified. - - Model Configuration (Required if skip_validation=False): - - model_dtype (str or torch.dtype): Model data type (e.g., "torch.bfloat16", torch.float16). - - autocast_dtype (str or torch.dtype): Autocast data type for mixed precision. - - Dataset Configuration (Required if skip_validation=False): - - dataset_path (str): Path to the validation dataset. - - data_column (str): Column name in dataset containing text data. - - block_size (int): Maximum sequence length for tokenization. - - eval_samples (int, optional): Number of samples to evaluate. - - val_dataset_name (str): Name of validation dataset split. - - source_datasets_to_discard (list[str], optional): List of source datasets to exclude. - - load_dataset_fn (callable, optional): Custom function to load the dataset. - - Data Processing (Required if skip_validation=False): - - micro_batch_size (int): Batch size for evaluation. - - seed (int): Random seed for reproducibility. - - shuffle_seed (int, optional): Seed for shuffling data. - - varlen (bool): Enable variable-length sequences. - - bos_rate (float): Rate of adding BOS token. - - fim_rate (float): Fill-in-the-middle rate for code completion tasks. - - fim_spm_rate (float): SPM-based fill-in-the-middle rate. - - Output Configuration: - - output_dir (Path, optional): Directory to save validation results. - Auto-generated from solutions_path if not provided. - - Execution Options (Optional if skip_validation=False): - - calc_losses_on_cpu (bool): Calculate losses on CPU to avoid OOM. - - write_results (bool): Write validation results to file. - - activations_log_dir (str, optional): Directory to log activation scores. - - activation_hooks_kwargs (str or dict, optional): Arguments for activation hooks. + Puzzle Configuration (Required) attributes: + + - ``replacement_library_path`` (Path): Path to the replacement library JSON file. + - ``solutions_path`` (Path): Path to puzzle solutions JSON file or directory containing solution files. + - ``solutions_to_validate`` (list[int], optional): Indices of specific solutions to validate. + Validates all solutions if None. + - ``sort_solutions_by`` (str, optional): JSON field path to sort solutions by before validation. + - ``bigger_is_better`` (bool): If True, sort solutions in descending order. Used with sort_solutions_by. + - ``skip_validation`` (bool): If True, skip model validation and only save models if requested. + - ``save_models`` (bool): If True, save realized model checkpoints for each solution. + + Teacher/Tokenizer Configuration attributes: + + - ``teacher_dir`` (Path, optional): Path to teacher model directory. Auto-inferred if not provided. + - ``tokenizer_name`` (str, optional): Tokenizer name/path. Uses teacher_dir if not specified. 
+ + Model Configuration (Required if skip_validation=False) attributes: + + - ``model_dtype`` (str or torch.dtype): Model data type (e.g., "torch.bfloat16", torch.float16). + - ``autocast_dtype`` (str or torch.dtype): Autocast data type for mixed precision. + + Dataset Configuration (Required if skip_validation=False) attributes: + + - ``dataset_path`` (str): Path to the validation dataset. + - ``data_column`` (str): Column name in dataset containing text data. + - ``block_size`` (int): Maximum sequence length for tokenization. + - ``eval_samples`` (int, optional): Number of samples to evaluate. + - ``val_dataset_name`` (str): Name of validation dataset split. + - ``source_datasets_to_discard`` (list[str], optional): List of source datasets to exclude. + - ``load_dataset_fn`` (callable, optional): Custom function to load the dataset. + + Data Processing (Required if skip_validation=False) attributes: + + - ``micro_batch_size`` (int): Batch size for evaluation. + - ``seed`` (int): Random seed for reproducibility. + - ``shuffle_seed`` (int, optional): Seed for shuffling data. + - ``varlen`` (bool): Enable variable-length sequences. + - ``bos_rate`` (float): Rate of adding BOS token. + - ``fim_rate`` (float): Fill-in-the-middle rate for code completion tasks. + - ``fim_spm_rate`` (float): SPM-based fill-in-the-middle rate. + + Output Configuration attributes: + + - ``output_dir`` (Path, optional): Directory to save validation results. + Auto-generated from solutions_path if not provided. + + Execution Options (Optional if skip_validation=False) attributes: + + - ``calc_losses_on_cpu`` (bool): Calculate losses on CPU to avoid OOM. + - ``write_results`` (bool): Write validation results to file. + - ``activations_log_dir`` (str, optional): Directory to log activation scores. + - ``activation_hooks_kwargs`` (str or dict, optional): Arguments for activation hooks. Returns: None. Saves validation results and optionally model checkpoints to disk. 
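Since `validate_puzzle_solutions` takes a single `DictConfig`, a caller typically assembles one with OmegaConf. The sketch below is a hypothetical invocation: the keys mirror the docstring above, but every path and value is a placeholder, not a shipped default.

```python
# Hypothetical config for validate_puzzle_solutions; all values are placeholders.
from omegaconf import OmegaConf

args = OmegaConf.create(
    {
        # Puzzle configuration (required)
        "replacement_library_path": "puzzle/replacement_library.json",
        "solutions_path": "puzzle/mip/puzzle_solutions/solutions.json",
        "solutions_to_validate": [0, 1],  # or None to validate all
        "sort_solutions_by": None,
        "bigger_is_better": False,
        "skip_validation": False,
        "save_models": True,
        # Model / dataset configuration (required when skip_validation=False)
        "model_dtype": "torch.bfloat16",
        "autocast_dtype": "torch.bfloat16",
        "dataset_path": "path/to/validation_dataset",
        "data_column": "text",
        "block_size": 4096,
        "eval_samples": 128,
        "val_dataset_name": "valid",
        # Data processing
        "micro_batch_size": 1,
        "seed": 42,
        "varlen": False,
        "bos_rate": 1.0,
        "fim_rate": 0.0,
        "fim_spm_rate": 0.0,
        # Output
        "output_dir": "puzzle/validation_results",
    }
)

# from modelopt.torch.puzzletron.tools.validate_puzzle_with_multi_replacements import (
#     validate_puzzle_solutions,
# )
# validate_puzzle_solutions(args)  # requires an initialized distributed group
```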
@@ -273,7 +278,7 @@ def _extract_layer_replacements_from_puzzle_solution( def load_puzzle_solutions( solutions_path: Path, - sort_solutions_by: Optional[str], + sort_solutions_by: str | None, bigger_is_better: bool, ) -> list[dict]: assert solutions_path.exists(), f"{solutions_path=} does not exist" diff --git a/modelopt/torch/_compress/tools/validation_utils.py b/modelopt/torch/puzzletron/tools/validation_utils.py similarity index 88% rename from modelopt/torch/_compress/tools/validation_utils.py rename to modelopt/torch/puzzletron/tools/validation_utils.py index 6f0b1fcb5..697977cda 100644 --- a/modelopt/torch/_compress/tools/validation_utils.py +++ b/modelopt/torch/puzzletron/tools/validation_utils.py @@ -21,7 +21,7 @@ # mypy: ignore-errors from pathlib import Path -from typing import TYPE_CHECKING, Any, Optional, Union +from typing import TYPE_CHECKING, Any import torch from omegaconf import DictConfig, OmegaConf @@ -29,13 +29,13 @@ from transformers import PreTrainedTokenizerBase import modelopt.torch.utils.distributed as dist -from modelopt.torch._compress.tools import validate_model -from modelopt.torch._compress.tools.logger import mprint -from modelopt.torch._compress.tools.robust_json import json_dump -from modelopt.torch._compress.utils.validation import LowMemorySparseTensor +from modelopt.torch.puzzletron.tools import validate_model +from modelopt.torch.puzzletron.tools.logger import mprint +from modelopt.torch.puzzletron.tools.robust_json import json_dump +from modelopt.torch.puzzletron.utils.validation import LowMemorySparseTensor if TYPE_CHECKING: - from modelopt.torch._compress.sewing_kit import StitchedModule + from modelopt.torch.puzzletron.sewing_kit import StitchedModule def validate_model_and_extract_hidden_states( @@ -44,7 +44,7 @@ def validate_model_and_extract_hidden_states( tokenizer: PreTrainedTokenizerBase, output_dir: str | Path, model_name: str, - extra_payload: Optional[dict[str, Any]] = None, + extra_payload: dict[str, Any] | None = None, pipeline_parallel: bool = False, val_dataloader=None, ) -> list[torch.Tensor | LowMemorySparseTensor]: @@ -77,7 +77,7 @@ def validate_model_with_teacher_similarity_metrics( target_hidden_states_per_batch: list[torch.Tensor], output_dir: str | Path, model_name: str, - extra_payload: Optional[dict[str, Any]] = None, + extra_payload: dict[str, Any] | None = None, pipeline_parallel: bool = False, calculate_full_score_ablations: bool = False, val_dataloader=None, diff --git a/modelopt/torch/_compress/utils/checkpoint_manager.py b/modelopt/torch/puzzletron/utils/checkpoint_manager.py similarity index 93% rename from modelopt/torch/_compress/utils/checkpoint_manager.py rename to modelopt/torch/puzzletron/utils/checkpoint_manager.py index b43c37481..3fc4bf87e 100644 --- a/modelopt/torch/_compress/utils/checkpoint_manager.py +++ b/modelopt/torch/puzzletron/utils/checkpoint_manager.py @@ -13,25 +13,22 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -Checkpoint manager for activation hook scoring with periodic saves and resume support. 
-""" +"""Checkpoint manager for activation hook scoring with periodic saves and resume support.""" import json import time from pathlib import Path -from typing import Any, Dict, Optional +from typing import Any import modelopt.torch.utils.distributed as dist -from modelopt.torch._compress.tools.logger import aprint, mprint +from modelopt.torch.puzzletron.tools.logger import aprint, mprint class ScoringCheckpointManager: """Manages checkpointing for activation hook scoring with periodic saves.""" def __init__(self, checkpoint_dir: str, activation_hooks=None, checkpoint_interval: int = 100): - """ - Initialize checkpoint manager. + """Initialize checkpoint manager. Args: checkpoint_dir: Directory to save checkpoints @@ -63,9 +60,8 @@ def __init__(self, checkpoint_dir: str, activation_hooks=None, checkpoint_interv if self.is_main_process: self.checkpoint_dir.mkdir(parents=True, exist_ok=True) - def load_checkpoint(self) -> Optional[Dict[str, Any]]: - """ - Load existing checkpoint if available, including hook states. + def load_checkpoint(self) -> dict[str, Any] | None: + """Load existing checkpoint if available, including hook states. Returns: Dict with checkpoint info or None if no checkpoint exists @@ -76,7 +72,7 @@ def load_checkpoint(self) -> Optional[Dict[str, Any]]: return None try: - with open(self.progress_file, "r") as f: + with open(self.progress_file) as f: checkpoint_data = json.load(f) # Validate checkpoint @@ -114,8 +110,7 @@ def load_checkpoint(self) -> Optional[Dict[str, Any]]: return None def load_hook_states(self, activation_hooks) -> bool: - """ - Load hook states from checkpoint files. + """Load hook states from checkpoint files. Args: activation_hooks: Hook objects to load states into @@ -173,8 +168,7 @@ def should_skip_batch(self, batch_idx: int) -> bool: return should_skip def update_progress(self, batch_idx: int, total_batches: int): - """ - Update progress and potentially save checkpoint. + """Update progress and potentially save checkpoint. Args: batch_idx: Current batch index @@ -207,8 +201,7 @@ def update_progress(self, batch_idx: int, total_batches: int): dist.barrier() def save_checkpoint(self): - """ - Save current checkpoint to disk (progress info only). + """Save current checkpoint to disk (progress info only). Hook states are saved separately in update_progress. """ try: diff --git a/modelopt/torch/_compress/utils/data/dataloaders.py b/modelopt/torch/puzzletron/utils/data/dataloaders.py similarity index 97% rename from modelopt/torch/_compress/utils/data/dataloaders.py rename to modelopt/torch/puzzletron/utils/data/dataloaders.py index 865ad89fb..892d1f3c2 100644 --- a/modelopt/torch/_compress/utils/data/dataloaders.py +++ b/modelopt/torch/puzzletron/utils/data/dataloaders.py @@ -13,9 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -DataLoader utilities for language model training and validation. 
-""" +"""DataLoader utilities for language model training and validation.""" from collections.abc import Callable, Mapping, Sequence from functools import partial @@ -30,8 +28,8 @@ from tqdm import tqdm from transformers import PreTrainedTokenizerBase -from modelopt.torch._compress.tools.logger import mprint -from modelopt.torch._compress.utils.data.dataset import ConstantLengthDataset +from modelopt.torch.puzzletron.tools.logger import mprint +from modelopt.torch.puzzletron.utils.data.dataset import ConstantLengthDataset def collate_none_fn( diff --git a/modelopt/torch/_compress/utils/data/dataset.py b/modelopt/torch/puzzletron/utils/data/dataset.py similarity index 96% rename from modelopt/torch/_compress/utils/data/dataset.py rename to modelopt/torch/puzzletron/utils/data/dataset.py index 342b0821e..a71049105 100644 --- a/modelopt/torch/_compress/utils/data/dataset.py +++ b/modelopt/torch/puzzletron/utils/data/dataset.py @@ -14,14 +14,12 @@ # limitations under the License. # mypy: ignore-errors import functools -from typing import Optional, Sequence +from collections.abc import Sequence import numpy as np import torch from torch.utils.data import IterableDataset -from modelopt.torch._compress.tools.logger import aprint, mprint - FIM_TOKEN_START = " int: """Calculate the key-value dimension for grouped-query attention. TODO: Consider a better place for this function. + Args: n_heads_in_group: Number of attention heads per key-value group. n_head: Total number of attention heads. @@ -52,6 +53,7 @@ def raise_unknown_subblock_config_error(subblock_config: Any) -> None: """Raise an error for invalid subblock configuration types. TODO: Consider a better place for this function. + Args: subblock_config: The invalid subblock configuration object. @@ -67,6 +69,7 @@ def sizeof_dtype(dtype: torch.dtype) -> int | float: """Return the size in bytes of the given data type. TODO: Consider a better place for this function. + Args: dtype: PyTorch data type or custom type string (e.g., 'nvfp4'). @@ -122,10 +125,10 @@ def solution_to_str(block_configs: list[dict[str, Any] | BlockConfig]) -> str: def block_config_to_str(block_config: BlockConfig | dict[str, Any] | None) -> str | None: - """ - Convert a BlockConfig to a human-readable string representation. + """Convert a BlockConfig to a human-readable string representation. TODO: Consider a better place for this function. + Args: block_config: BlockConfig dataclass or dict containing attention and ffn configs. @@ -150,6 +153,7 @@ def subblock_config_to_str( """Convert a subblock config (FFN, Attention, Mamba, or MoE) to string. TODO: Consider a better place for this function. + Args: subblock_config: FFNConfig, AttentionConfig dataclass or dict. subblock_name: Name of subblock ('ffn', 'attention', 'mamba', 'moe'). @@ -212,8 +216,7 @@ def subblock_config_to_str( class EmptyInitOnDevice(torch.overrides.TorchFunctionMode): def __init__(self, device=None, dtype=None): - """ - Create tensors with given device and dtype and don't run initialization + """Create tensors with given device and dtype and don't run initialization (but instead use "empty tensors", i.e. uninitialized memory). 
device: `torch.device` to work with @@ -222,8 +225,8 @@ def __init__(self, device=None, dtype=None): Example:: with EmptyInitOnDevice("cuda", dtype=torch.bfloat16): model = LLaMA(model_config) - model.load_state_dict(torch.load("llama-lit/7B/lit-llama.pth"))""" - + model.load_state_dict(torch.load("llama-lit/7B/lit-llama.pth")) + """ self.device = device self.dtype = dtype diff --git a/modelopt/torch/_compress/utils/validate_runtime_pipeline.py b/modelopt/torch/puzzletron/utils/validate_runtime_pipeline.py similarity index 92% rename from modelopt/torch/_compress/utils/validate_runtime_pipeline.py rename to modelopt/torch/puzzletron/utils/validate_runtime_pipeline.py index b3be70644..db1e8f2ce 100644 --- a/modelopt/torch/_compress/utils/validate_runtime_pipeline.py +++ b/modelopt/torch/puzzletron/utils/validate_runtime_pipeline.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -Model evaluation utilities for models split across multiple GPUs in pipeline-parallel mode. +"""Model evaluation utilities for models split across multiple GPUs in pipeline-parallel mode. Coordinates forward passes and loss computation through model shards distributed across GPUs using sewing_kit's StitchedModule framework. Relies on validation.py for core loss computation. @@ -29,11 +28,11 @@ from tqdm import tqdm import modelopt.torch.utils.distributed as dist -from modelopt.torch._compress.decilm.deci_lm_hf_code.modeling_decilm import ( +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.modeling_decilm import ( DeciLMForCausalLM, LMHead, ) -from modelopt.torch._compress.sewing_kit import ( +from modelopt.torch.puzzletron.sewing_kit import ( ExternalTarget, InputArgs, ModuleTarget, @@ -41,15 +40,15 @@ RemoteTarget, StitchedModule, ) -from modelopt.torch._compress.sewing_kit.core import InputReducer -from modelopt.torch._compress.sewing_kit.utils import ( +from modelopt.torch.puzzletron.sewing_kit.core import InputReducer +from modelopt.torch.puzzletron.sewing_kit.utils import ( distributed_recv_obj, distributed_send_obj, fake_tensor, ) -from modelopt.torch._compress.tools.checkpoint_utils import init_module_with_state_dict -from modelopt.torch._compress.tools.sharded_checkpoint_utils import DummyBlock -from modelopt.torch._compress.utils.validation import _organize_outputs, calculate_batch_outputs +from modelopt.torch.puzzletron.tools.checkpoint_utils import init_module_with_state_dict +from modelopt.torch.puzzletron.tools.sharded_checkpoint_utils import DummyBlock +from modelopt.torch.puzzletron.utils.validation import _organize_outputs, calculate_batch_outputs class HiddenStatesAndLMHead(list): @@ -70,8 +69,7 @@ def calculate_losses_pipeline( checkpoint_manager=None, autocast_dtype: torch.dtype = torch.bfloat16, ) -> tuple[dict[str, dict], HiddenStatesAndLMHead | None] | tuple[None, None]: - """ - Do model forward on each batch and calculate LM loss. + """Do model forward on each batch and calculate LM loss. Optionally also calculate kl_div loss and other metrics from given target_hidden_states_per_batch. Optionally return hidden states per batch. Does not support data-parallel. 
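The per-batch LM loss that `calculate_losses_pipeline` (and `calculate_losses` in `validation.py` below) computes is standard next-token cross entropy. Here is a minimal self-contained sketch in plain PyTorch with toy shapes; the real helpers additionally handle shard stitching, autocast, and optional KL terms against teacher hidden states.

```python
# Next-token cross entropy on toy shapes; illustrative only.
import torch
import torch.nn.functional as F

def lm_loss(logits: torch.Tensor, input_ids: torch.Tensor) -> torch.Tensor:
    """logits: [batch, tokens, vocab]; input_ids: [batch, tokens]."""
    # Predict token t+1 from position t: drop the last logit, shift labels left.
    shift_logits = logits[:, :-1, :].contiguous()
    shift_labels = input_ids[:, 1:].contiguous()
    return F.cross_entropy(
        shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
    )

logits = torch.randn(2, 8, 128)                 # [batch=2, tokens=8, vocab=128]
input_ids = torch.randint(0, 128, (2, 8))
print(lm_loss(logits, input_ids))               # scalar loss tensor
```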
diff --git a/modelopt/torch/_compress/utils/validation.py b/modelopt/torch/puzzletron/utils/validation.py similarity index 97% rename from modelopt/torch/_compress/utils/validation.py rename to modelopt/torch/puzzletron/utils/validation.py index d970105e6..0fff90754 100644 --- a/modelopt/torch/_compress/utils/validation.py +++ b/modelopt/torch/puzzletron/utils/validation.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -Model validation and loss calculation utilities for single-GPU and multi-GPU setups. +"""Model validation and loss calculation utilities for single-GPU and multi-GPU setups. Also provides helper functions for loss metrics, KL divergence, JS divergence, and similarity losses for knowledge distillation. @@ -34,7 +33,7 @@ from transformers.generation.logits_process import TopKLogitsWarper, TopPLogitsWarper from typing_extensions import Self -from modelopt.torch._compress.tools import kd_model +from modelopt.torch.puzzletron.tools import kd_model class UnshardedLowMemorySparseTensor: @@ -94,8 +93,7 @@ def calculate_losses( return_probs: bool = False, checkpoint_manager=None, ) -> tuple[dict[str, dict], None] | tuple[None, None]: - """ - Do model forward on each batch and calculate LM loss. + """Do model forward on each batch and calculate LM loss. Works on lit-llama models (single gpu) and huggingface models (can be multi gpu). Does not support data-parallel. @@ -313,8 +311,7 @@ def _calculate_teacher_similarity_scores( target_logits: torch.Tensor, calculate_full_score_ablations: bool, ) -> dict[str, list[float]]: - """ - hidden_states: [batch, tokens, n_embd] + """hidden_states: [batch, tokens, n_embd] target_hidden_states: [batch, tokens, n_embd] logits: [batch, tokens, vocab] target_logits: [batch, tokens, vocab] @@ -443,9 +440,7 @@ class ClipEpsilon(Enum): def _logits_to_logprobs( logits: torch.Tensor, clip_epsilon: ClipEpsilon, epsilon_factor: float ) -> torch.Tensor: - """ - logits: [tokens, vocab] - """ + """logits: [tokens, vocab]""" logprobs = logits.log_softmax( -1 ) # must normalize logits before clipping otherwise log(1/voacb) means nothing @@ -467,8 +462,7 @@ def kl_div( clip_epsilon: ClipEpsilon = ClipEpsilon.NO_CLIP, epsilon_factor: float = 1.0, ) -> float: - """ - Kullback-Leibler Divergence for a single sample. + """Kullback-Leibler Divergence for a single sample. logits: [tokens, vocab] target_probs: [tokens, vocab] """ @@ -487,8 +481,7 @@ def js_div( clip_epsilon: ClipEpsilon = ClipEpsilon.NO_CLIP, epsilon_factor: float = 1.0, ) -> float: - """ - Jensen-Shannon Divergence for a single sample. + """Jensen-Shannon Divergence for a single sample. logits: [tokens, vocab] target_probs: [tokens, vocab] """ @@ -508,8 +501,7 @@ def tv_dist( clip_epsilon: ClipEpsilon = ClipEpsilon.NO_CLIP, epsilon_factor: float = 1.0, ) -> float: - """ - Total Variation Distance (L1-loss) for a single sample. + """Total Variation Distance (L1-loss) for a single sample. 
     logits: [tokens, vocab]
     target_probs: [tokens, vocab]
     """
diff --git a/pyproject.toml b/pyproject.toml
index 010070e63..0ae6bdf78 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -80,7 +80,7 @@ extend-ignore = [
     "D",
     "E501",
 ] # Ignore missing docstrings or line length for Jupyter notebooks
-"modelopt/torch/_compress/*" = [
+"modelopt/torch/puzzletron/*" = [
     "C4",
     "D",
     "E",
diff --git a/setup.py b/setup.py
index bd14878a5..6096e31ca 100644
--- a/setup.py
+++ b/setup.py
@@ -102,8 +102,8 @@
         "setuptools>=80",
         "setuptools-scm>=8",
     ],
-    # Dependedencies for modelopt.torch._compress subpackage
-    "compress": [
+    # Dependencies for modelopt.torch.puzzletron subpackage
+    "puzzletron": [
         "fire",
         "hydra-core==1.3.2",
         "immutabledict",
diff --git a/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B-attn-pruning.yaml b/tests/_test_utils/torch/puzzletron/resources/configs/Llama-3_1-8B-attn-pruning.yaml
similarity index 100%
rename from tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B-attn-pruning.yaml
rename to tests/_test_utils/torch/puzzletron/resources/configs/Llama-3_1-8B-attn-pruning.yaml
diff --git a/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B-ffn-pruning.yaml b/tests/_test_utils/torch/puzzletron/resources/configs/Llama-3_1-8B-ffn-pruning.yaml
similarity index 100%
rename from tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B-ffn-pruning.yaml
rename to tests/_test_utils/torch/puzzletron/resources/configs/Llama-3_1-8B-ffn-pruning.yaml
diff --git a/tests/gpu/torch/_compress/resources/configs/pruning/attn_pruning.yaml b/tests/_test_utils/torch/puzzletron/resources/configs/pruning/attn_pruning.yaml
similarity index 100%
rename from tests/gpu/torch/_compress/resources/configs/pruning/attn_pruning.yaml
rename to tests/_test_utils/torch/puzzletron/resources/configs/pruning/attn_pruning.yaml
diff --git a/tests/gpu/torch/_compress/resources/configs/pruning/ffn_pruning.yaml b/tests/_test_utils/torch/puzzletron/resources/configs/pruning/ffn_pruning.yaml
similarity index 100%
rename from tests/gpu/torch/_compress/resources/configs/pruning/ffn_pruning.yaml
rename to tests/_test_utils/torch/puzzletron/resources/configs/pruning/ffn_pruning.yaml
diff --git a/tests/gpu/torch/_compress/resources/configs/pruning/hidden_dim_pruning.yaml b/tests/_test_utils/torch/puzzletron/resources/configs/pruning/hidden_dim_pruning.yaml
similarity index 100%
rename from tests/gpu/torch/_compress/resources/configs/pruning/hidden_dim_pruning.yaml
rename to tests/_test_utils/torch/puzzletron/resources/configs/pruning/hidden_dim_pruning.yaml
diff --git a/tests/gpu/torch/_compress/resources/configs/pruning/pruning_defaults.yaml b/tests/_test_utils/torch/puzzletron/resources/configs/pruning/pruning_defaults.yaml
similarity index 100%
rename from tests/gpu/torch/_compress/resources/configs/pruning/pruning_defaults.yaml
rename to tests/_test_utils/torch/puzzletron/resources/configs/pruning/pruning_defaults.yaml
diff --git a/tests/gpu/torch/_compress/resources/configs/validate_model_defaults.yaml b/tests/_test_utils/torch/puzzletron/resources/configs/validate_model_defaults.yaml
similarity index 76%
rename from tests/gpu/torch/_compress/resources/configs/validate_model_defaults.yaml
rename to tests/_test_utils/torch/puzzletron/resources/configs/validate_model_defaults.yaml
index 192b82c75..1d042d75d 100644
--- a/tests/gpu/torch/_compress/resources/configs/validate_model_defaults.yaml
+++ b/tests/_test_utils/torch/puzzletron/resources/configs/validate_model_defaults.yaml
@@ -14,4 +14,4 @@ write_results:
false calc_losses_on_cpu: false activations_log_dir: model_name_or_path: -load_dataset_fn: ${get_object:modelopt.torch._compress.utils.data.dataloaders.load_from_disk_fn} +load_dataset_fn: ${get_object:modelopt.torch.puzzletron.utils.data.dataloaders.load_from_disk_fn} diff --git a/tests/gpu/torch/_compress/resources/configs/validate_solutions_defaults.yaml b/tests/_test_utils/torch/puzzletron/resources/configs/validate_solutions_defaults.yaml similarity index 100% rename from tests/gpu/torch/_compress/resources/configs/validate_solutions_defaults.yaml rename to tests/_test_utils/torch/puzzletron/resources/configs/validate_solutions_defaults.yaml diff --git a/tests/gpu/torch/_compress/resources/tokenizer/special_tokens_map.json b/tests/_test_utils/torch/puzzletron/resources/tokenizer/special_tokens_map.json similarity index 100% rename from tests/gpu/torch/_compress/resources/tokenizer/special_tokens_map.json rename to tests/_test_utils/torch/puzzletron/resources/tokenizer/special_tokens_map.json diff --git a/tests/gpu/torch/_compress/resources/tokenizer/tokenizer.json b/tests/_test_utils/torch/puzzletron/resources/tokenizer/tokenizer.json similarity index 100% rename from tests/gpu/torch/_compress/resources/tokenizer/tokenizer.json rename to tests/_test_utils/torch/puzzletron/resources/tokenizer/tokenizer.json diff --git a/tests/gpu/torch/_compress/resources/tokenizer/tokenizer_config.json b/tests/_test_utils/torch/puzzletron/resources/tokenizer/tokenizer_config.json similarity index 100% rename from tests/gpu/torch/_compress/resources/tokenizer/tokenizer_config.json rename to tests/_test_utils/torch/puzzletron/resources/tokenizer/tokenizer_config.json diff --git a/tests/gpu/torch/_compress/resources/tokenizer/truncate_tokenizer.py b/tests/_test_utils/torch/puzzletron/resources/tokenizer/truncate_tokenizer.py similarity index 100% rename from tests/gpu/torch/_compress/resources/tokenizer/truncate_tokenizer.py rename to tests/_test_utils/torch/puzzletron/resources/tokenizer/truncate_tokenizer.py diff --git a/tests/gpu/torch/_compress/compress_test_utils.py b/tests/_test_utils/torch/puzzletron/utils.py similarity index 96% rename from tests/gpu/torch/_compress/compress_test_utils.py rename to tests/_test_utils/torch/puzzletron/utils.py index 1da08602b..6c9feecd0 100644 --- a/tests/gpu/torch/_compress/compress_test_utils.py +++ b/tests/_test_utils/torch/puzzletron/utils.py @@ -22,14 +22,14 @@ from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, PreTrainedTokenizerBase import modelopt.torch.utils.distributed as dist -from modelopt.torch._compress.tools.hydra_utils import register_hydra_resolvers +from modelopt.torch.puzzletron.tools.hydra_utils import register_hydra_resolvers def setup_test_model_and_data( project_root_path: Path, tmp_path: Path, rank: int ) -> tuple[Path, Path, Path]: """ - Setup the test model and data for the compress NAS search. + Setup the test model and data for the puzzletron NAS search. Args: project_root_path (Path): the root path of the project @@ -111,7 +111,7 @@ def create_tokenizer(project_root_path: Path) -> PreTrainedTokenizerBase: """ Create a tokenizer for the Llama model. 
""" - tokenizer_path = project_root_path / "tests/gpu/torch/_compress/resources/tokenizer" + tokenizer_path = project_root_path / "tests/_test_utils/torch/puzzletron/resources/tokenizer" tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) return tokenizer diff --git a/tests/gpu/torch/_compress/conftest.py b/tests/gpu/torch/puzzletron/conftest.py similarity index 100% rename from tests/gpu/torch/_compress/conftest.py rename to tests/gpu/torch/puzzletron/conftest.py diff --git a/tests/gpu/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py b/tests/gpu/torch/puzzletron/decilm/converters/test_convert_llama3_config_to_decilm_config.py similarity index 90% rename from tests/gpu/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py rename to tests/gpu/torch/puzzletron/decilm/converters/test_convert_llama3_config_to_decilm_config.py index 7576f270b..4b1ea0b41 100644 --- a/tests/gpu/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py +++ b/tests/gpu/torch/puzzletron/decilm/converters/test_convert_llama3_config_to_decilm_config.py @@ -16,12 +16,9 @@ import json from pathlib import Path -from gpu.torch._compress.compress_test_utils import ( - create_and_save_small_llama_model, - create_tokenizer, -) +from _test_utils.torch.puzzletron.utils import create_and_save_small_llama_model, create_tokenizer -from modelopt.torch._compress.decilm.converters.convert_llama3_to_decilm import ( +from modelopt.torch.puzzletron.decilm.converters.convert_llama3_to_decilm import ( convert_llama3_to_decilm, ) diff --git a/tests/gpu/torch/_compress/nas/plugins/test_nas_convert.py b/tests/gpu/torch/puzzletron/nas/plugins/test_nas_convert.py similarity index 86% rename from tests/gpu/torch/_compress/nas/plugins/test_nas_convert.py rename to tests/gpu/torch/puzzletron/nas/plugins/test_nas_convert.py index 913bc2116..c409da28b 100644 --- a/tests/gpu/torch/_compress/nas/plugins/test_nas_convert.py +++ b/tests/gpu/torch/puzzletron/nas/plugins/test_nas_convert.py @@ -20,11 +20,11 @@ import torch from _test_utils.torch.distributed.utils import spawn_multiprocess_job -from gpu.torch._compress.compress_test_utils import setup_test_model_and_data +from _test_utils.torch.puzzletron.utils import setup_test_model_and_data import modelopt.torch.nas as mtn import modelopt.torch.utils.distributed as dist -from modelopt.torch._compress.nas.plugins.compress_nas_plugin import CompressModel +from modelopt.torch.puzzletron.nas.plugins.puzzletron_nas_plugin import PuzzletronModel def test_nas_convert_ffn_pruning(project_root_path: Path, tmp_path: Path): @@ -43,18 +43,18 @@ def _test_nas_convert_ffn_pruning_multiprocess_job( puzzle_dir, llama_checkpoint_path, dataset_path = setup_test_model_and_data( project_root_path, tmp_path, rank ) - hydra_config_dir = project_root_path / "tests/gpu/torch/_compress/resources/configs" + hydra_config_dir = project_root_path / "tests/_test_utils/torch/puzzletron/resources/configs" hydra_config_name = "Llama-3_1-8B-ffn-pruning" # # Run the mnt.convert() step # - input_model = CompressModel() + input_model = PuzzletronModel() mtn.convert( input_model, mode=[ ( - "compress", + "puzzletron", { "puzzle_dir": str(puzzle_dir), "input_model_path": str(llama_checkpoint_path), @@ -82,8 +82,6 @@ def _test_nas_convert_ffn_pruning_multiprocess_job( dist.cleanup() - print("PYTEST SUMMARY: test_nas_convert_ffn_pruning() test has finished successfully") - def test_nas_convert_attn_pruning(project_root_path: Path, tmp_path: Path): 
spawn_multiprocess_job( @@ -101,18 +99,18 @@ def _test_nas_convert_attn_pruning_multiprocess_job( puzzle_dir, llama_checkpoint_path, dataset_path = setup_test_model_and_data( project_root_path, tmp_path, rank ) - hydra_config_dir = project_root_path / "tests/gpu/torch/_compress/resources/configs" + hydra_config_dir = project_root_path / "tests/_test_utils/torch/puzzletron/resources/configs" hydra_config_name = "Llama-3_1-8B-attn-pruning" # # Run the mnt.convert() step # - input_model = CompressModel() + input_model = PuzzletronModel() mtn.convert( input_model, mode=[ ( - "compress", + "puzzletron", { "puzzle_dir": str(puzzle_dir), "input_model_path": str(llama_checkpoint_path), @@ -142,5 +140,3 @@ def _test_nas_convert_attn_pruning_multiprocess_job( assert (puzzle_dir / "ckpts/n_heads_in_group32").exists() dist.cleanup() - - print("PYTEST SUMMARY: test_nas_convert_attn_pruning() test has finished successfully") diff --git a/tests/gpu/torch/_compress/nas/plugins/test_nas_search.py b/tests/gpu/torch/puzzletron/nas/plugins/test_nas_search.py similarity index 89% rename from tests/gpu/torch/_compress/nas/plugins/test_nas_search.py rename to tests/gpu/torch/puzzletron/nas/plugins/test_nas_search.py index 1b4ed93c6..a1258c1d0 100644 --- a/tests/gpu/torch/_compress/nas/plugins/test_nas_search.py +++ b/tests/gpu/torch/puzzletron/nas/plugins/test_nas_search.py @@ -19,11 +19,11 @@ import torch from _test_utils.torch.distributed.utils import spawn_multiprocess_job -from gpu.torch._compress.compress_test_utils import setup_test_model_and_data +from _test_utils.torch.puzzletron.utils import setup_test_model_and_data import modelopt.torch.nas as mtn import modelopt.torch.utils.distributed as dist -from modelopt.torch._compress.nas.plugins.compress_nas_plugin import CompressModel +from modelopt.torch.puzzletron.nas.plugins.puzzletron_nas_plugin import PuzzletronModel def test_nas_search(project_root_path: Path, tmp_path: Path): @@ -42,18 +42,18 @@ def _test_nas_search_multiprocess_job( puzzle_dir, llama_checkpoint_path, dataset_path = setup_test_model_and_data( project_root_path, tmp_path, rank ) - hydra_config_dir = project_root_path / "tests/gpu/torch/_compress/resources/configs" + hydra_config_dir = project_root_path / "tests/_test_utils/torch/puzzletron/resources/configs" hydra_config_name = "Llama-3_1-8B-ffn-pruning" # # Run the mnt.convert() step # - input_model = CompressModel() + input_model = PuzzletronModel() converted_model = mtn.convert( input_model, mode=[ ( - "compress", + "puzzletron", { "puzzle_dir": str(puzzle_dir), "input_model_path": str(llama_checkpoint_path), @@ -100,5 +100,3 @@ def _test_nas_search_multiprocess_job( assert (puzzle_dir / "mip/puzzle_solutions/target_memory_780000MiB/solutions.json").exists() dist.cleanup() - - print("PYTEST SUMMARY: test_nas_search() test has finished successfully") diff --git a/tests/gpu/torch/_compress/test_compress.py b/tests/gpu/torch/puzzletron/test_puzzletron.py similarity index 83% rename from tests/gpu/torch/_compress/test_compress.py rename to tests/gpu/torch/puzzletron/test_puzzletron.py index dd6e0eb5a..faf72f749 100644 --- a/tests/gpu/torch/_compress/test_compress.py +++ b/tests/gpu/torch/puzzletron/test_puzzletron.py @@ -19,11 +19,11 @@ import torch from _test_utils.torch.distributed.utils import spawn_multiprocess_job -from gpu.torch._compress.compress_test_utils import setup_test_model_and_data +from _test_utils.torch.puzzletron.utils import setup_test_model_and_data import modelopt.torch.utils.distributed as dist -from 
modelopt.torch._compress import compress -from modelopt.torch._compress.decilm.converters.convert_llama3_to_decilm import ( +from modelopt.torch.puzzletron import puzzletron +from modelopt.torch.puzzletron.decilm.converters.convert_llama3_to_decilm import ( convert_llama3_to_decilm, ) @@ -33,21 +33,23 @@ # Note: Bypass is disabled now in the test. -def test_compress(project_root_path: Path, tmp_path: Path): +def test_puzzletron(project_root_path: Path, tmp_path: Path): spawn_multiprocess_job( size=min(torch.cuda.device_count(), 2), # assertions configured for atmost 2 GPUs - job=partial(_test_compress_multiprocess_job, project_root_path, tmp_path), + job=partial(_test_puzzletron_multiprocess_job, project_root_path, tmp_path), backend="nccl", ) -def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, rank: int, size: int): +def _test_puzzletron_multiprocess_job( + project_root_path: Path, tmp_path: Path, rank: int, size: int +): dist.setup(timeout=timedelta(10)) # Setup the test model and data. puzzle_dir, llama_checkpoint_path, dataset_path = setup_test_model_and_data( project_root_path, tmp_path, rank ) - hydra_config_dir = project_root_path / "tests/gpu/torch/_compress/resources/configs" + hydra_config_dir = project_root_path / "tests/_test_utils/torch/puzzletron/resources/configs" hydra_config_name = "Llama-3_1-8B-ffn-pruning" # Convert the Llama model to DeciLM model. @@ -59,7 +61,9 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran dist.barrier() # Compress the model using a one-click approach - compress.compress(str(hydra_config_dir), hydra_config_name, str(puzzle_dir), str(dataset_path)) + puzzletron.puzzletron( + str(hydra_config_dir), hydra_config_name, str(puzzle_dir), str(dataset_path) + ) # # Check assertions @@ -93,11 +97,6 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran dist.cleanup() - print( - "PYTEST SUMMARY: test_compress_model() test has finished successfully. Puzzle directory: ", - puzzle_dir, - ) - def _assert_score_pruning_activations(puzzle_dir: Path): """Assertions for the score_pruning_activations step 1."""
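Outside the test harness, the same one-click entry point exercised by `test_puzzletron` can be called directly. This is a hedged sketch: the paths and config name are placeholders, and the call is assumed to run under an initialized `torch.distributed` process group (e.g. via `torchrun`), as it does in the test above.

```python
# Hypothetical one-click invocation mirroring the test above. All paths and
# the config name are placeholders; run under torch.distributed (e.g. torchrun).
from modelopt.torch.puzzletron import puzzletron

puzzletron.puzzletron(
    "path/to/hydra_config_dir",  # directory containing the hydra configs
    "my_pruning_config",         # hydra config name (placeholder)
    "path/to/puzzle_dir",        # working directory for pipeline outputs
    "path/to/prepared_dataset",  # dataset produced by prepare_dataset
)
```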