vllm-project · brian-dellabetta · Jun 18, 2025 · Jun 23, 2025 · Jun 23, 2025 · Jun 25, 2025
diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py
@@ -304,13 +304,13 @@ def _set_resolved_mappings(self, model: Module) -> None:
         """
         resolved_mappings: list[ResolvedMapping] = []
         for mapping_idx, mapping in enumerate(self.mappings):
-            smooth_layers = get_layers(mapping.smooth_layer, model)
+            smooth_layers = get_layers(
+                mapping.smooth_layer, model, exclude_internal_modules=True
+            )
             smooth_names = [
                 smooth_name
                 for smooth_name in smooth_layers
-                if not find_name_or_class_matches(
-                    smooth_name, model, self.ignore + ["re:.*_observer$"]
-                )
+                if not find_name_or_class_matches(smooth_name, model, self.ignore)
             ]
 
             num_skipped_mappings = 0
@@ -331,6 +331,7 @@ def _set_resolved_mappings(self, model: Module) -> None:
                     for balance_suffix, balance_layer in get_layers(
                         balance_regex,
                         smooth_parent,
+                        exclude_internal_modules=True,
                     ).items():
                         balance_name = f"{smooth_parent_name}.{balance_suffix}"
 

diff --git a/src/llmcompressor/modifiers/awq/mappings.py b/src/llmcompressor/modifiers/awq/mappings.py
@@ -74,8 +74,56 @@ class AWQMapping:
     ),
 ]
 
+# Gemma includes a pre_feedforward_layernorm in between
+#  post_attention_layernorm and the mlp gate/up proj layers;
+#  use that instead of post_attention_layernorm in the 3rd mapping:
+_gemma_mappings = [
+    AWQMapping(
+        "re:.*input_layernorm$",
+        ["re:.*q_proj$", "re:.*k_proj$", "re:.*v_proj$"],
+    ),
+    AWQMapping("re:.*v_proj$", ["re:.*o_proj$"]),
+    AWQMapping(
+        "re:.*pre_feedforward_layernorm$",
+        ["re:.*gate_proj$", "re:.*up_proj$"],
+    ),
+    AWQMapping(
+        "re:.*up_proj$",
+        ["re:.*down_proj$"],
+    ),
+]
+
+
+# Cohere architecture is similar to default, with one fundamental difference:
+# the MLP block is executed in parallel to the attention. So the tensor goes
+# through input_layernorm and from there it goes directly to both the
+# attention module and the MLP module.
+_cohere_mappings = [
+    AWQMapping(
+        "re:.*input_layernorm$",
+        [
+            "re:.*self_attn.q_proj$",
+            "re:.*self_attn.k_proj$",
+            "re:.*self_attn.v_proj$",
+            "re:.*mlp.gate_proj$",
+            "re:.*mlp.up_proj$",
+        ],
+    ),
+    AWQMapping("re:.*v_proj$", ["re:.*o_proj$"]),
+    AWQMapping(
+        "re:.*up_proj$",
+        ["re:.*down_proj$"],
+    ),
+]
+
 AWQ_MAPPING_REGISTRY: Dict[str, list[AWQMapping]] = {
+    "CohereForCausalLM": _cohere_mappings,
+    "Cohere2ForCausalLM": _cohere_mappings,
+    "Gemma2ForCausalLM": _gemma_mappings,
+    "Gemma3ForCausalLM": _gemma_mappings,
+    "Gemma3ForConditionalGeneration": _gemma_mappings,
     "LlamaForCausalLM": _default_mappings,
+    "Mistral3ForConditionalGeneration": _default_mappings,
     "MistralForCausalLM": _default_mappings,
     "Phi3ForCausalLM": _phi_mappings,
     "Phi3VForCausalLM": _phi_mappings,

diff --git a/src/llmcompressor/utils/pytorch/module.py b/src/llmcompressor/utils/pytorch/module.py
@@ -9,12 +9,13 @@
 
 import torch
 from compressed_tensors.quantization.utils import is_module_quantized
-from packaging import version
+from compressed_tensors.transform import TransformBase
 from torch.nn import Linear, Module, Parameter
 from torch.nn.modules.conv import _ConvNd
 from transformers import PreTrainedModel
 
 from llmcompressor.core import ModelParameterizedLayer
+from llmcompressor.observers import Observer
 from llmcompressor.utils.fsdp.context import (
     fix_fsdp_module_name,
     summon_full_params_context,
@@ -64,10 +65,6 @@
     "get_layer_by_name",
 ]
 
-
-_PARSED_TORCH_VERSION = version.parse(torch.__version__)
-
-
 ALL_TARGET = "__ALL__"
 ALL_PRUNABLE_TARGET = "__ALL_PRUNABLE__"
 ALL_QUANTIZABLE_TARGET = "__ALL_QUANTIZABLE__"
@@ -164,8 +161,46 @@ def match_layers_params(
     return resolved
 
 
-def get_layers(targets: Union[str, List[str]], module: Module) -> Dict[str, Module]:
-    return match_layers_params(targets, module)
+def is_internal_module(module: Module) -> bool:
+    """
+    llm-compressor adds additional modules to a model, like observers
+    and transforms, as part of its normal operation
+
+    :param module: module to check
+    :return: True if module is an Observer or TransformBase, i.e. a module
+        internally instantiated by llm-compressor, otherwise False
+    """
+    return isinstance(module, (TransformBase, Observer))
+
+
+def get_layers(
+    targets: Union[str, List[str]],
+    module: Module,
+    exclude_internal_modules: bool = False,
+) -> Dict[str, Module]:
+    """
+    Get layers (also known as submodules) of module based on targets
+
+    :param targets: names or regexes to search for
+        Can be regex, e.g. "re:.*input_layernorm$" to find all layers
+        in module whose names end in string "input_layernorm"
+    :param module: Parent module in which to search for targets
+    :param exclude_internal_modules: If True, don't include internal
+        modules added by llm-compressor, e.g. Observers and Transforms.
+        Defaults to False to maintain backward compatibility
+
+    :return: dict of {layer name -> module} of all layers in module
+        that match targets
+    """
+    layer_dict = match_layers_params(targets, module)
+    if exclude_internal_modules:
+        layer_dict = {
+            name: layer
+            for name, layer in layer_dict.items()
+            if not is_internal_module(layer)
+        }
+
+    return layer_dict
 
 
 def get_layer(target: str, module: Module) -> Tuple[str, Module]: