[Research] Llama4 AutoWrapper + Onloading #1438

Draft · wants to merge 61 commits into base: main

Commits (61)
ca96907
replace with patch_attr
kylesayrs Feb 24, 2025
755a063
Merge branch 'main' into kylesayrs/rename-patch_attr
kylesayrs Mar 10, 2025
96cf84e
Merge branch 'main' into kylesayrs/rename-patch_attr
kylesayrs Mar 13, 2025
2f0136c
simplify
kylesayrs Apr 8, 2025
803b73f
Merge remote-tracking branch 'origin' into kylesayrs/rename-patch_attr
kylesayrs Apr 8, 2025
5dfaabf
remove dreg
kylesayrs Apr 8, 2025
35a046e
add utils
kylesayrs Apr 8, 2025
547e68f
add no init weights context
kylesayrs Apr 8, 2025
bb1912c
add tracing tests
kylesayrs Apr 8, 2025
71c5575
add test
kylesayrs Apr 8, 2025
0e074e4
Merge branch 'main' into kylesayrs/tracing-testing
kylesayrs Apr 8, 2025
ce1b91c
rename file to be picked up by pytest
kylesayrs Apr 11, 2025
daaf284
Merge remote-tracking branch 'origin' into kylesayrs/tracing-testing
kylesayrs Apr 11, 2025
b144eb1
add hf token
kylesayrs Apr 16, 2025
d17877b
Merge remote-tracking branch 'origin' into kylesayrs/tracing-testing
kylesayrs Apr 16, 2025
f212a3e
remove hf cache dir, remove whisper
kylesayrs Apr 16, 2025
5875aa1
Merge remote-tracking branch 'origin' into kylesayrs/tracing-testing
kylesayrs Apr 29, 2025
8c75c0d
cleanup, do not require ignore
kylesayrs Apr 29, 2025
e85ec84
add import skip
kylesayrs Apr 30, 2025
f74c00c
Consolidate build config (#1398)
dbarbuzzi Apr 29, 2025
8e7c288
[Tests] Disable silently failing kv cache test (#1371)
kylesayrs Apr 29, 2025
27dccc1
Drop `flash_attn` skip for quantizing_moe example tests (#1396)
dbarbuzzi Apr 29, 2025
bf75260
[Tests] Use requires_gpu, fix missing gpu test skip, add explicit tes…
kylesayrs Apr 30, 2025
ad6e069
Implement `QuantizationMixin` (#1351)
kylesayrs May 2, 2025
9ef5aba
Add new-features section (#1408)
rahul-tuli May 2, 2025
bcfcadb
[Tracing] Support tracing of Gemma3 [#1248] (#1373)
kelkelcheng May 3, 2025
7c0f855
wip
kylesayrs May 3, 2025
75c4de4
wip: working
kylesayrs May 3, 2025
e63bd2b
add util
kylesayrs May 3, 2025
48d48d2
works for llama
kylesayrs May 3, 2025
7571119
wip: fix conditionally_assigned_names
kylesayrs May 4, 2025
171a341
wip: able to trace almost all
kylesayrs May 4, 2025
b42c08d
all trace without custom definitions
kylesayrs May 4, 2025
25e6479
Merge remote-tracking branch 'origin' into kylesayrs/autowrapper
kylesayrs May 4, 2025
0ca6be2
wip: cleanup
kylesayrs May 4, 2025
5d28dba
cleanup, add wrapped functions to locals, add gemma
kylesayrs May 5, 2025
13b56f0
break up files, cleanup
kylesayrs May 5, 2025
f53f70a
fix pipeline typo, better docstrings and erroring
kylesayrs May 5, 2025
42a501c
docstrings
kylesayrs May 5, 2025
99d864d
name wrapped functions by index for asthetics
kylesayrs May 5, 2025
7134fcf
wip
kylesayrs May 5, 2025
56b12b7
use hashed names
kylesayrs May 5, 2025
36b65ec
type hint
kylesayrs May 5, 2025
0877393
Merge branch into kylesayrs/autowrapper
kylesayrs May 5, 2025
8c10444
add todo about self
kylesayrs May 5, 2025
f298d3e
style
kylesayrs May 5, 2025
23f7401
notes
kylesayrs May 5, 2025
72f0511
wip: remove dead code
kylesayrs May 5, 2025
ada3080
Merge remote-tracking branch 'origin' into kylesayrs/autowrapper
kylesayrs May 5, 2025
6b48644
fix style
kylesayrs May 5, 2025
90a7d4b
use shared namespace to handle self
kylesayrs May 5, 2025
44d045a
remove debug prints
kylesayrs May 5, 2025
c298cfd
wip: validating outputs, so far looks correct
kylesayrs May 6, 2025
3a9cf61
update docstrings
kylesayrs May 6, 2025
4a253e9
cleanup
kylesayrs May 6, 2025
d6de0b5
Merge remote-tracking branch 'origin' into kylesayrs/autowrapper
kylesayrs May 15, 2025
89bdbfe
integrate with onloading
kylesayrs May 16, 2025
9bb76d9
fix sequential onloading
kylesayrs May 16, 2025
69cabfc
add back state dict hook
kylesayrs May 16, 2025
1089d1a
hard code linear grouping
kylesayrs May 19, 2025
e3a17a2
wip
kylesayrs May 20, 2025
88 changes: 88 additions & 0 deletions llama4_example.py
@@ -0,0 +1,88 @@
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, Llama4ForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.utils.dev import skip_weights_download
from llmcompressor.utils.llama4 import linearize_moe

# Load model.
model_id = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
# with skip_weights_download(Llama4ForConditionalGeneration):
model = Llama4ForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.bfloat16  # load on cpu
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

model = linearize_moe(model)

# Oneshot arguments
DATASET_ID = "flickr30k"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048
DATASET_SPLIT = {"calibration": f"test[:{NUM_CALIBRATION_SAMPLES}]"}


# Define a oneshot data collator for multimodal inputs.
def data_collator(batch):
    assert len(batch) == 1
    return {
        key: torch.tensor(value)
        if key != "pixel_values"
        else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
        for key, value in batch[0].items()
    }


# Recipe
recipe = [
    GPTQModifier(
        targets="Linear",
        scheme="W4A16",
        ignore=[
            "language_model.lm_head",
            "re:vision_model.*",
        ],
        # sequential_targets=["Llama4TextDecoderLayer"],
        sequential_targets=["Llama4TextAttention", "Llama4TextMLP"],
    ),
]

# Perform oneshot
oneshot(
    model=model,
    tokenizer=model_id,
    dataset=DATASET_ID,
    splits=DATASET_SPLIT,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    trust_remote_code_model=True,
    data_collator=data_collator,
    oneshot_device="cuda:0",
)

# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Please describe the animal in this image\n"},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
raw_image = Image.open(requests.get(image_url, stream=True).raw)

inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))
print("==========================================")

# Save to disk compressed.
SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)
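
Note: the example above relies on linearize_moe from llmcompressor.utils.llama4, whose implementation is not part of the diff shown here. Presumably it rewrites the fused Llama4 MoE expert weights into per-expert torch.nn.Linear modules so that the recipe's targets="Linear" scheme and the Llama4TextMLP sequential target can match them. The sketch below only illustrates that general idea and is not the PR's code; the FusedExperts class and split_experts helper are hypothetical.

import torch


class FusedExperts(torch.nn.Module):
    """Hypothetical fused-MoE layer: one 3D weight stores all experts."""

    def __init__(self, num_experts: int, hidden_size: int, intermediate_size: int):
        super().__init__()
        # weight[i] is the (intermediate_size, hidden_size) projection of expert i
        self.weight = torch.nn.Parameter(
            torch.randn(num_experts, intermediate_size, hidden_size)
        )


def split_experts(fused: FusedExperts) -> torch.nn.ModuleList:
    """Split a fused 3D expert weight into per-expert nn.Linear modules."""
    num_experts, out_features, in_features = fused.weight.shape
    experts = torch.nn.ModuleList()
    for i in range(num_experts):
        linear = torch.nn.Linear(in_features, out_features, bias=False)
        with torch.no_grad():
            linear.weight.copy_(fused.weight[i])
        experts.append(linear)
    return experts


fused = FusedExperts(num_experts=4, hidden_size=8, intermediate_size=16)
experts = split_experts(fused)
print([type(m).__name__ for m in experts])  # ['Linear', 'Linear', 'Linear', 'Linear']

Once the experts are ordinary Linear modules, a Linear-targeted quantizer can compress each expert's weight independently.
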
4 changes: 4 additions & 0 deletions src/llmcompressor/args/dataset_arguments.py
@@ -179,3 +179,7 @@ class DatasetArguments(CustomDatasetArguments):
             "independent]"
         },
     )
+    oneshot_device: Optional[str] = field(
+        default=None,
+        metadata={"help": "Device to run oneshot calibration on"},
+    )
4 changes: 0 additions & 4 deletions src/llmcompressor/args/model_arguments.py
@@ -80,10 +80,6 @@ class ModelArguments:
         default=True,
         metadata={"help": "Whether to compress sparse models during save"},
     )
-    oneshot_device: Optional[str] = field(
-        default="cuda:0",
-        metadata={"help": "Device to run oneshot calibration on"},
-    )
     model_revision: str = field(
         default="main",
         metadata={
6 changes: 1 addition & 5 deletions src/llmcompressor/entrypoints/utils.py
@@ -16,7 +16,7 @@

 from llmcompressor.args import ModelArguments, RecipeArguments, TrainingArguments
 from llmcompressor.core import reset_session
-from llmcompressor.pytorch.model_load.helpers import fallback_to_cpu, parse_dtype
+from llmcompressor.pytorch.model_load.helpers import parse_dtype
 from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
     modify_save_pretrained,
     patch_tied_tensors_bug,
@@ -193,10 +193,6 @@ def initialize_model_from_path(
         else model_args.model_name_or_path
     )

-    # Fallback to CPU if GPU requested and not available
-    model_args.oneshot_device = fallback_to_cpu(model_args.oneshot_device)
-
-    device_map = model_args.oneshot_device
     if training_args is not None and training_args.do_train:
         device_map = "auto"

3 changes: 2 additions & 1 deletion src/llmcompressor/modifiers/quantization/gptq/base.py
@@ -147,7 +147,7 @@ def on_start(self, state: State, event: Event, **kwargs):

         # register gptq hooks
         added_hook = False
-        for module in state.model.modules():
+        for name, module in state.model.named_modules():
             if getattr_chain(module, "quantization_scheme.weights", None) is not None:
                 # HACK: previously, embeddings were not quantized because they were not
                 # accessible by the layer compressor. For now, we manually ignore it,
@@ -223,6 +223,7 @@ def calibrate_module(
             init_device = (
                 "cpu" if self.offload_hessians else get_execution_device(module)
             )
+            print(f"made hessian {self._module_names[module]}")
             self._hessians[module] = make_empty_hessian(module, device=init_device)
             self._num_samples[module] = 0

58 changes: 58 additions & 0 deletions src/llmcompressor/pipelines/sequential/ast_helpers.py
@@ -0,0 +1,58 @@
import ast
import contextlib
import inspect
import linecache
import sys
import textwrap
from typing import List

import torch

from llmcompressor.pipelines.sequential.ast_utils.AutoWrapper import AutoWrapper
from llmcompressor.utils import patch_attr


@contextlib.contextmanager
def autowrap_forwards(modules: List[torch.nn.Module], ignore: List[str]):
    with contextlib.ExitStack() as stack:
        for module in modules:
            if not isinstance(module, (torch.nn.ModuleList, torch.nn.ModuleDict)):
                stack.enter_context(autowrap_forward(module, ignore))
        yield


@contextlib.contextmanager
def autowrap_forward(module: torch.nn.Module, ignore: List[str]):
    # get source code of module forward
    source = inspect.getsource(module.forward)
    source = textwrap.dedent(source)
    tree = ast.parse(source)

    # construct namespace for our new code
    defining_module = sys.modules[module.__class__.__module__]
    namespace = defining_module.__dict__.copy()
    namespace.update({"torch.fx.wrap": torch.fx.wrap})
    namespace.update({"self": module})

    # autowrap untraceable code
    auto_wrapper = AutoWrapper(namespace, ignore)
    tree = auto_wrapper.auto_wrap(tree)

    # compile new forward function from autowrapped code
    filename = f"{module.__class__.__name__}_{hash(module)}_autowrapped"
    code = compile(tree, filename=filename, mode="exec")
    exec(code, namespace)  # ensure ns of functions is the same ns as torch.fx.wrap

    # enable better tracebacks if autowrapped code fails
    source_str = ast.unparse(tree)
    linecache.cache[filename] = (
        len(source_str),
        None,
        [line + "\n" for line in source_str.splitlines()],
        filename,
    )

    # patch forward with autowrapped forward
    new_forward = namespace["forward"].__get__(module)
    with patch_attr(module, "forward", new_forward):
        yield
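
For context, autowrap_forwards is a context manager that temporarily swaps each module's forward for an AST-rewritten version in which untraceable statements are wrapped with torch.fx.wrap; patch_attr restores the original forwards on exit, and the linecache registration makes tracebacks from the rewritten code point at the autowrapped source instead of an anonymous compiled string. Below is a minimal usage sketch, not taken from this PR's sequential pipeline: the ToyBlock model, the empty ignore list, and the direct use of torch.fx.symbolic_trace are assumptions for illustration.

import torch

from llmcompressor.pipelines.sequential.ast_helpers import autowrap_forwards


class ToyBlock(torch.nn.Module):
    """Hypothetical block standing in for a decoder layer."""

    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(8, 8)

    def forward(self, x):
        return torch.relu(self.linear(x))


model = torch.nn.Sequential(ToyBlock(), ToyBlock())

# While the context is active, each block's forward is the autowrapped version,
# so fx tracing sees code whose untraceable pieces (if any) have been wrapped;
# the original forwards are restored when the context exits.
with autowrap_forwards(list(model.children()), ignore=[]):
    graph_module = torch.fx.symbolic_trace(model)

print(graph_module.graph)
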