
Commit 9283e50

Authored by mergennachin, committed by facebook-github-bot
Skeleton for GGUF conversion (#2018)

Summary: Starting a skeleton implementation.
- Only llama for now. New architectures will be added inside gguf_util/converters/.
- Only fp32 for now. Quantization will be figured out later.
- Reuses the existing llama code in examples to reduce duplication. For other architectures, there won't be much duplication.
- Currently converts to PyTorch, and then goes through export, to_edge, to_executorch. But that's an implementation detail.

Pull Request resolved: #2018

Test Plan:
`python extension/gguf_util/convert_main.py --gguf_file="/Users/mnachin/models_gguf/OpenHermes-2.5-Mistral-7B-fp16.gguf"`

Reviewed By: shoumikhin
Differential Revision: D53982833
Pulled By: mergennachin
fbshipit-source-id: 5402c0de3e729e434763a5d6a390448603e77429
1 parent a6d71e2 commit 9283e50

File tree: 6 files changed, +317 -0 lines changed

extension/gguf_util/README.md (+6)

# Summary

This is an experimental feature to convert a model in [GGUF format](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) into a PTE file, which can be executed directly on ExecuTorch.

## Usage

python executorch/extension/gguf_util/convert_main.py --gguf_file=<path_to_gguf_file> --pte_file=<output_pte_file>

extension/gguf_util/convert_main.py (+53)

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import argparse

from executorch.extension.gguf_util.converter import convert_to_pte
from executorch.extension.gguf_util.load_gguf import load_file


def save_pte_program(_, pte_file) -> None:
    # TODO (mnachin): Save the PTE program to a file
    print(f"Saving PTE program to {pte_file}")


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--gguf_file",
        type=str,
        help="The GGUF file to load.",
    )
    parser.add_argument(
        "--pte_file",
        type=str,
        help="The path to save the PTE file.",
    )
    args = parser.parse_args()

    # Step 1: Load the GGUF file
    gguf_model_args, gguf_weights = load_file(args.gguf_file)

    # Step 2: Convert the GGUF model to PTE
    # Currently, under the hood, it first converts the GGUF model
    # to a PyTorch model (nn.Module), then exports to ET.
    #
    # NOTE: In the future, it may make sense to refactor the GGUF-to-nn.Module conversion
    # into its own package that can be shared between ExecuTorch and PyTorch core. There may
    # be a need to load a GGUF file directly into PyTorch core and use
    # torch.compile/AOTInductor to accelerate on server, without ever touching ExecuTorch.
    #
    # TODO(mnachin): Add a knob to delegate to various backends.
    pte_program = convert_to_pte(gguf_model_args, gguf_weights)

    # Step 3: Save the PTE program so that
    # it can be used by the ExecuTorch runtime
    save_pte_program(pte_program, args.pte_file)


if __name__ == "__main__":
    main()
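
Note that save_pte_program is still a stub in this commit. A minimal sketch of how it could be completed, assuming convert_to_pte eventually returns the serialized ExecuTorch program as bytes (that return type is an assumption here, since the export step is also still a TODO):

def save_pte_program(pte_program: bytes, pte_file: str) -> None:
    # Assumption: pte_program is the serialized ExecuTorch program buffer.
    with open(pte_file, "wb") as f:
        f.write(pte_program)
    print(f"Saved PTE program to {pte_file}")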

extension/gguf_util/converter.py (+27)

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from executorch.extension.gguf_util.load_gguf import GGUFModelArgs, GGUFWeights


def convert_to_pte(model_args: GGUFModelArgs, weights: GGUFWeights) -> None:
    """Convert a GGUF model into a PTE file, an ExecuTorch program.

    Args:
        model_args: The arguments for the GGUF model.
        weights: The weights of the GGUF model.
    """

    # Switch statement based on the architecture enum.
    # Each enum has its own converter function.
    if model_args.arch == "llama":
        from executorch.extension.gguf_util.converters.llama_converter import (
            convert_to_pte as llama_convert_to_pte,
        )

        return llama_convert_to_pte(model_args, weights)
    else:
        raise NotImplementedError("Unsupported architecture.")
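
Per the summary, additional architectures are meant to live in their own modules under gguf_util/converters/. A hypothetical skeleton for such a module (the mistral name and file are illustrative only, not part of this commit), showing the contract the dispatch above expects, namely a convert_to_pte(model_args, weights) function:

# Hypothetical converters/mistral_converter.py (illustrative, not in this commit).
from executorch.extension.gguf_util.load_gguf import GGUFModelArgs, GGUFWeights


def convert_to_pte(gguf_model_args: GGUFModelArgs, gguf_weights: GGUFWeights) -> bytes:
    # A real converter would build the architecture-specific nn.Module,
    # load the GGUF weights into it, and export it, as llama_converter does.
    raise NotImplementedError("mistral conversion is not implemented yet.")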

extension/gguf_util/converters/llama_converter.py (+121)

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import copy
from typing import Any, Mapping

import torch
import torch.nn as nn
from executorch.examples.models.llama2.llama_transformer import (
    ModelArgs as LlamaModelArgs,
    Transformer as LlamaTransformer,
)
from executorch.extension.gguf_util.load_gguf import GGUFModelArgs, GGUFWeights


def _create_pt_model(
    gguf_model_args: GGUFModelArgs,
) -> nn.Module:
    llama_model_args = LlamaModelArgs(
        dim=gguf_model_args.embedding_length,
        n_layers=gguf_model_args.block_count,
        n_heads=gguf_model_args.attention.head_count,
        n_kv_heads=gguf_model_args.attention.head_count_kv,
        vocab_size=gguf_model_args.vocab_size,
        norm_eps=gguf_model_args.attention.layer_norm_rms_epsilon,
        hidden_dim=gguf_model_args.feed_forward_length,
        rope_freq_base=gguf_model_args.rope.freq_base,
    )
    pt_model = LlamaTransformer(llama_model_args)
    pt_model.eval()
    return pt_model


# Mapping from GGUF tensor-name fragments to the parameter names used by the
# llama Transformer in executorch/examples/models/llama2.
_name_replacements = [
    ("blk", "layers"),
    ("token_embd", "tok_embeddings"),
    ("attn_q", "attention.wq"),
    ("attn_k", "attention.wk"),
    ("attn_v", "attention.wv"),
    ("attn_output", "attention.wo"),
    ("attn_norm", "attention_norm"),
    ("output_norm.weight", "norm.weight"),
    ("ffn_down", "feed_forward.w2"),
    ("ffn_gate", "feed_forward.w1"),
    ("ffn_up", "feed_forward.w3"),
]


def _convert_gguf_tensor_name_to_llama_nn(gguf_name: str) -> str:
    result = copy.deepcopy(gguf_name)
    for gguf_string, replacement in _name_replacements:
        result = result.replace(gguf_string, replacement)
    return result


def _convert_to_state_dict(gguf_weights: GGUFWeights) -> Mapping[str, Any]:
    state_dict = {}
    for tensor in gguf_weights.tensors:
        gguf_tensor_name = tensor.name
        nn_tensor_name = _convert_gguf_tensor_name_to_llama_nn(gguf_tensor_name)
        new_tensor = tensor.data.reshape(tensor.shape).transpose()
        state_dict[nn_tensor_name] = torch.from_numpy(new_tensor)

    return state_dict


def _load_weights_into_nn(
    pt_model: nn.Module, gguf_model_args: GGUFModelArgs, gguf_weights: GGUFWeights
):
    state_dict: Mapping[str, Any] = _convert_to_state_dict(gguf_weights)

    # We need to fake-initialize the mask to match llama_transformer.py
    for id in range(gguf_model_args.block_count):
        mask_name = f"layers.{id}.attention.mask"
        mask = torch.full(
            (1, 1, pt_model.params.max_seq_len, pt_model.params.max_seq_len),
            float("-inf"),
        )
        mask = torch.triu(mask, diagonal=1)
        state_dict[mask_name] = mask

    pt_model.load_state_dict(state_dict)
    return


def _create_pte_program(pt_model: nn.Module) -> bytes:
    # TODO (mnachin): Export
    return


def convert_to_pte(gguf_model_args: GGUFModelArgs, gguf_weights: GGUFWeights) -> bytes:
    """Convert a GGUF llama model into an ExecuTorch program.

    Args:
        gguf_model_args: The arguments for the GGUF model.
        gguf_weights: The weights of the GGUF model.
    """

    assert (
        gguf_model_args.arch == "llama"
    ), "Only LLaMa models are supported by this converter."

    # Step 1: Create the PyTorch model
    print("Create the PyTorch model")
    pt_model = _create_pt_model(
        gguf_model_args,
    )

    # Step 2: Load the weights into the PyTorch model
    print("Load the weights into the PyTorch model")
    _load_weights_into_nn(pt_model, gguf_model_args, gguf_weights)

    # Step 3: Export to ExecuTorch
    print("Exporting to ExecuTorch.")
    pte_program = _create_pte_program(pt_model)
    return pte_program
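
_create_pte_program is a TODO in this commit. The summary says the flow goes through export, to_edge, and to_executorch; below is a minimal sketch under that assumption. The example input shape/dtype and the exact export calls are assumptions for illustration, not the committed implementation:

import torch
import torch.nn as nn
from executorch.exir import to_edge


def _create_pte_program(pt_model: nn.Module) -> bytes:
    # Assumed example input: a single sequence of token ids; real shapes depend
    # on the llama Transformer's forward() signature.
    example_inputs = (torch.tensor([[1]], dtype=torch.long),)
    exported_program = torch.export.export(pt_model, example_inputs)
    edge_program = to_edge(exported_program)
    executorch_program = edge_program.to_executorch()
    return executorch_program.buffer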

Install script (+8)

#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

pip install gguf==0.6.0

extension/gguf_util/load_gguf.py (+102)

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass
from pathlib import Path
from typing import Any

import gguf
from gguf import GGUFValueType, ReaderTensor


@dataclass
class AttentionArgs:
    head_count: int
    head_count_kv: int
    layer_norm_rms_epsilon: float


@dataclass
class RopeArgs:
    freq_base: float


@dataclass
class GGUFModelArgs:
    arch: str
    embedding_length: int
    block_count: int
    feed_forward_length: int
    vocab_size: int
    attention: AttentionArgs
    rope: RopeArgs


@dataclass
class GGUFWeights:
    tensors: list[ReaderTensor]


def _get_metadata(reader: gguf.GGUFReader) -> dict[str, Any]:
    metadata: dict[str, Any] = {}

    for idx, field in enumerate(reader.fields.values()):
        val = None
        if field.types[:1] == [GGUFValueType.ARRAY]:
            itype = field.types[-1]
            if itype == GGUFValueType.STRING:
                val = [
                    str(bytes(field.parts[idx]), encoding="utf-8") for idx in field.data
                ]
            else:
                val = [pv for idx in field.data for pv in field.parts[idx].tolist()]
        elif field.types[0] == GGUFValueType.STRING:
            val = str(bytes(field.parts[-1]), encoding="utf-8")
        else:
            val = field.parts[-1].tolist()[0]

        metadata[field.name] = val

    return metadata


def _build_model_args(metadata: dict[str, Any]) -> GGUFModelArgs:
    arch = metadata["general.architecture"]

    return GGUFModelArgs(
        arch=arch,
        embedding_length=metadata[f"{arch}.embedding_length"],
        block_count=metadata[f"{arch}.block_count"],
        feed_forward_length=metadata[f"{arch}.feed_forward_length"],
        vocab_size=len(metadata["tokenizer.ggml.tokens"]),
        attention=AttentionArgs(
            head_count=metadata[f"{arch}.attention.head_count"],
            head_count_kv=metadata[f"{arch}.attention.head_count_kv"],
            layer_norm_rms_epsilon=metadata[f"{arch}.attention.layer_norm_rms_epsilon"],
        ),
        rope=RopeArgs(
            freq_base=metadata[f"{arch}.rope.freq_base"],
        ),
    )


def load_file(gguf_file: str) -> (GGUFModelArgs, GGUFWeights):
    """
    Load a GGUF file and return the model arguments and weights.
    """
    if not Path(gguf_file).is_file():
        raise ValueError(f"Could not find file {gguf_file}")

    reader = gguf.GGUFReader(gguf_file, "r")

    # Step 1: Build GGUFModelArgs
    metadata = _get_metadata(reader)
    model_args = _build_model_args(metadata)

    # Step 2: Build GGUFWeights
    gguf_weights = GGUFWeights(tensors=reader.tensors)

    return (model_args, gguf_weights)
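
A quick sanity check of the loader (the local GGUF path below is a hypothetical example, not part of this commit):

from executorch.extension.gguf_util.load_gguf import load_file

# Hypothetical path to a llama-architecture GGUF checkpoint.
model_args, weights = load_file("/path/to/llama-7b-fp16.gguf")
print(model_args.arch, model_args.block_count, model_args.vocab_size)
print(f"Loaded {len(weights.tensors)} tensors")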
