diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py
index 51e8cf8b9..7d5c3671f 100644
--- a/src/llmcompressor/modifiers/awq/base.py
+++ b/src/llmcompressor/modifiers/awq/base.py
@@ -304,13 +304,13 @@ def _set_resolved_mappings(self, model: Module) -> None:
         """
         resolved_mappings: list[ResolvedMapping] = []
         for mapping_idx, mapping in enumerate(self.mappings):
-            smooth_layers = get_layers(mapping.smooth_layer, model)
+            smooth_layers = get_layers(
+                mapping.smooth_layer, model, exclude_internal_modules=True
+            )
             smooth_names = [
                 smooth_name
                 for smooth_name in smooth_layers
-                if not find_name_or_class_matches(
-                    smooth_name, model, self.ignore + ["re:.*_observer$"]
-                )
+                if not find_name_or_class_matches(smooth_name, model, self.ignore)
             ]
 
             num_skipped_mappings = 0
@@ -331,6 +331,7 @@ def _set_resolved_mappings(self, model: Module) -> None:
                 for balance_suffix, balance_layer in get_layers(
                     balance_regex,
                     smooth_parent,
+                    exclude_internal_modules=True,
                 ).items():
                     balance_name = f"{smooth_parent_name}.{balance_suffix}"
 
diff --git a/src/llmcompressor/modifiers/awq/mappings.py b/src/llmcompressor/modifiers/awq/mappings.py
index 6390445c8..693406ec3 100644
--- a/src/llmcompressor/modifiers/awq/mappings.py
+++ b/src/llmcompressor/modifiers/awq/mappings.py
@@ -74,8 +74,56 @@ class AWQMapping:
     ),
 ]
 
+# Gemma includes a pre_feedforward_layernorm in between
+# post_attention_layernorm and the mlp gate/up proj layers;
+# use that instead of post_attention_layernorm in the 3rd mapping:
+_gemma_mappings = [
+    AWQMapping(
+        "re:.*input_layernorm$",
+        ["re:.*q_proj$", "re:.*k_proj$", "re:.*v_proj$"],
+    ),
+    AWQMapping("re:.*v_proj$", ["re:.*o_proj$"]),
+    AWQMapping(
+        "re:.*pre_feedforward_layernorm$",
+        ["re:.*gate_proj$", "re:.*up_proj$"],
+    ),
+    AWQMapping(
+        "re:.*up_proj$",
+        ["re:.*down_proj$"],
+    ),
+]
+
+
+# The Cohere architecture is similar to the default, with one fundamental
+# difference: the MLP block is executed in parallel to attention. The
+# tensor goes through input_layernorm and from there directly into both
+# the attention module and the MLP module.
+_cohere_mappings = [
+    AWQMapping(
+        "re:.*input_layernorm$",
+        [
+            "re:.*self_attn.q_proj$",
+            "re:.*self_attn.k_proj$",
+            "re:.*self_attn.v_proj$",
+            "re:.*mlp.gate_proj$",
+            "re:.*mlp.up_proj$",
+        ],
+    ),
+    AWQMapping("re:.*v_proj$", ["re:.*o_proj$"]),
+    AWQMapping(
+        "re:.*up_proj$",
+        ["re:.*down_proj$"],
+    ),
+]
+
 AWQ_MAPPING_REGISTRY: Dict[str, list[AWQMapping]] = {
+    "CohereForCausalLM": _cohere_mappings,
+    "Cohere2ForCausalLM": _cohere_mappings,
+    "Gemma2ForCausalLM": _gemma_mappings,
+    "Gemma3ForCausalLM": _gemma_mappings,
+    "Gemma3ForConditionalGeneration": _gemma_mappings,
     "LlamaForCausalLM": _default_mappings,
+    "Mistral3ForConditionalGeneration": _default_mappings,
     "MistralForCausalLM": _default_mappings,
     "Phi3ForCausalLM": _phi_mappings,
     "Phi3VForCausalLM": _phi_mappings,
diff --git a/src/llmcompressor/utils/pytorch/module.py b/src/llmcompressor/utils/pytorch/module.py
index 835493fa3..c923af413 100644
--- a/src/llmcompressor/utils/pytorch/module.py
+++ b/src/llmcompressor/utils/pytorch/module.py
@@ -9,12 +9,13 @@
 
 import torch
 from compressed_tensors.quantization.utils import is_module_quantized
-from packaging import version
+from compressed_tensors.transform import TransformBase
 from torch.nn import Linear, Module, Parameter
 from torch.nn.modules.conv import _ConvNd
 from transformers import PreTrainedModel
 
 from llmcompressor.core import ModelParameterizedLayer
+from llmcompressor.observers import Observer
 from llmcompressor.utils.fsdp.context import (
     fix_fsdp_module_name,
     summon_full_params_context,
@@ -64,10 +65,6 @@
     "get_layer_by_name",
 ]
 
-
-_PARSED_TORCH_VERSION = version.parse(torch.__version__)
-
-
 ALL_TARGET = "__ALL__"
 ALL_PRUNABLE_TARGET = "__ALL_PRUNABLE__"
 ALL_QUANTIZABLE_TARGET = "__ALL_QUANTIZABLE__"
@@ -164,8 +161,46 @@ def match_layers_params(
     return resolved
 
 
-def get_layers(targets: Union[str, List[str]], module: Module) -> Dict[str, Module]:
-    return match_layers_params(targets, module)
+def is_internal_module(module: Module) -> bool:
+    """
+    llm-compressor adds additional modules to a model, like observers
+    and transforms, as part of its normal operation.
+
+    :param module: module to check
+    :return: True if the module was internally instantiated by
+        llm-compressor, otherwise False
+    """
+    return isinstance(module, (TransformBase, Observer))
+
+
+def get_layers(
+    targets: Union[str, List[str]],
+    module: Module,
+    exclude_internal_modules: bool = False,
+) -> Dict[str, Module]:
+    """
+    Get layers (also known as submodules) of module based on targets.
+
+    :param targets: names or regexes to search for.
+        Can be a regex, e.g. "re:.*input_layernorm$" to find all layers
+        in module whose names end in "input_layernorm"
+    :param module: parent module in which to search for targets
+    :param exclude_internal_modules: if True, don't include internal
+        modules added by llm-compressor, e.g. Observers and Transforms.
+        Defaults to False to maintain backward compatibility
+
+    :return: dict of {layer name -> module} of all layers in module
+        that match targets
+    """
+    layer_dict = match_layers_params(targets, module)
+    if exclude_internal_modules:
+        layer_dict = {
+            name: layer
+            for name, layer in layer_dict.items()
+            if not is_internal_module(layer)
+        }
+
+    return layer_dict
 
 
 def get_layer(target: str, module: Module) -> Tuple[str, Module]:
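
Usage note: a minimal sketch of the new `exclude_internal_modules` flag on `get_layers`, assuming llm-compressor with this patch and `transformers` are installed; the checkpoint name below is a placeholder, and the import path follows the file edited above.

```python
from transformers import AutoModelForCausalLM

from llmcompressor.utils.pytorch.module import get_layers

# Placeholder checkpoint; any HF causal LM with *input_layernorm submodules works.
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")

# Matches every submodule whose name ends in "input_layernorm". With
# exclude_internal_modules=True, Observer/TransformBase modules that
# llm-compressor attaches during compression are filtered from the result
# (a no-op on a freshly loaded model).
layers = get_layers("re:.*input_layernorm$", model, exclude_internal_modules=True)
print(sorted(layers))
```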
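
And a sketch of how the new registry entries can be inspected. `AWQ_MAPPING_REGISTRY` comes from the mappings module edited above; the `balance_layers` attribute name is an assumption (only `smooth_layer` appears in this diff).

```python
from llmcompressor.modifiers.awq.mappings import AWQ_MAPPING_REGISTRY

# Architectures added by this change resolve to the new mapping lists.
for arch in ("Gemma2ForCausalLM", "CohereForCausalLM"):
    print(arch)
    for mapping in AWQ_MAPPING_REGISTRY[arch]:
        # Each AWQMapping pairs a "smooth" layer regex with the layers it balances
        # (balance_layers is assumed here).
        print("   ", mapping.smooth_layer, "->", mapping.balance_layers)
```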