# limitations under the License.
"""Inference-only Gemma model implementation."""

+import json
+import gc
+import os
import re
import torch
from torch import nn


class Sampler(nn.Module):

-    def __init__(self, vocab_size: int):
+    def __init__(self, vocab_size: int, config: gemma_config.GemmaConfig):
        super().__init__()
        self.vocab_size = vocab_size
+        self.config = config

    @torch.no_grad()
    def forward(
@@ -47,6 +51,10 @@ def forward(
        logits = torch.matmul(hidden_states, embedding.t())
        if embedding_bias is not None:
            logits += embedding_bias
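+        # Soft-cap the final logits: logits = cap * tanh(logits / cap).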
+        if self.config.final_logit_softcapping is not None:
+            logits.div_(self.config.final_logit_softcapping)
+            logits = torch.tanh(logits)
+            logits.mul_(self.config.final_logit_softcapping)

        if temperatures is None:
            return torch.argmax(logits, dim=-1).squeeze(dim=-1), logits
@@ -208,8 +216,12 @@ def __init__(
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
+        attn_logit_softcapping: Optional[float],
+        query_pre_attn_scalar: Optional[int],
        head_dim: int,
        quant: bool,
+        attn_type: gemma_config.AttentionType,
+        sliding_window_size: Optional[int] = None,
    ):
        super().__init__()

@@ -225,7 +237,10 @@ def __init__(
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim

-        self.scaling = self.head_dim**-0.5
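+        # Gemma 2 can override the query scaling factor via query_pre_attn_scalar;
+        # otherwise fall back to the usual 1/sqrt(head_dim).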
+        if query_pre_attn_scalar is not None:
+            self.scaling = query_pre_attn_scalar**-0.5
+        else:
+            self.scaling = self.head_dim**-0.5

        self.qkv_proj = Linear(
            self.hidden_size,
@@ -236,6 +251,10 @@ def __init__(
            self.hidden_size,
            quant=quant)

+        self.attn_type = attn_type
+        self.sliding_window_size = sliding_window_size
+        self.attn_logit_softcapping = attn_logit_softcapping
+
    def forward(
        self,
        hidden_states: torch.Tensor,
@@ -283,7 +302,21 @@ def forward(
        v = value.transpose(1, 2)

        # [batch_size, n_local_heads, input_len, max_seq_len]
-        scores = torch.matmul(q, k.transpose(2, 3)) * self.scaling
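+        # Scale q in place before the matmul (equivalent to scaling the scores).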
+        q.mul_(self.scaling)
+        scores = torch.matmul(q, k.transpose(2, 3))
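+        # For local sliding-window layers, keep only positions within
+        # sliding_window_size of each query and mask out everything else.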
+        if (
+            self.attn_type == gemma_config.AttentionType.LOCAL_SLIDING
+            and self.sliding_window_size is not None
+        ):
+            all_ones = torch.ones_like(mask)
+            sliding_mask = torch.triu(
+                all_ones, -1 * self.sliding_window_size + 1
+            ) * torch.tril(all_ones, self.sliding_window_size - 1)
+            mask = torch.where(sliding_mask == 1, mask, -2.3819763e38)
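+        # Soft-cap the attention logits: scores = cap * tanh(scores / cap).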
+        if self.attn_logit_softcapping is not None:
+            scores.div_(self.attn_logit_softcapping)
+            scores = torch.tanh(scores)
+            scores.mul_(self.attn_logit_softcapping)
        scores = scores + mask
        scores = F.softmax(scores.float(), dim=-1).type_as(q)
@@ -308,8 +341,11 @@ def __init__(
            hidden_size=config.hidden_size,
            num_heads=config.num_attention_heads,
            num_kv_heads=config.num_key_value_heads,
+            attn_logit_softcapping=config.attn_logit_softcapping,
+            query_pre_attn_scalar=config.query_pre_attn_scalar,
            head_dim=config.head_dim,
            quant=config.quant,
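+            # Gemma 1 decoder layers always use global (full-context) attention.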
+            attn_type=gemma_config.AttentionType.GLOBAL,
        )
        self.mlp = GemmaMLP(
            hidden_size=config.hidden_size,
@@ -350,6 +386,77 @@ def forward(
        return hidden_states


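+# Gemma 2 decoder layer: attention and MLP blocks wrapped with additional
+# pre/post feed-forward RMSNorms and a per-layer attention type (global or
+# local sliding window).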
+class Gemma2DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: gemma_config.GemmaConfig,
+        attn_type: gemma_config.AttentionType,
+    ):
+        super().__init__()
+        self.self_attn = GemmaAttention(
+            hidden_size=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            attn_logit_softcapping=config.attn_logit_softcapping,
+            query_pre_attn_scalar=config.query_pre_attn_scalar,
+            head_dim=config.head_dim,
+            quant=config.quant,
+            attn_type=attn_type,
+            sliding_window_size=config.sliding_window_size,
+        )
+        self.mlp = GemmaMLP(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            quant=config.quant,
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size,
+                                       eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size,
+                                                eps=config.rms_norm_eps)
+        self.pre_feedforward_layernorm = (
+            RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+            if config.use_pre_ffw_norm
+            else None
+        )
+        self.post_feedforward_layernorm = (
+            RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+            if config.use_post_ffw_norm
+            else None
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        freqs_cis: torch.Tensor,
+        kv_write_indices: torch.Tensor,
+        kv_cache: Tuple[torch.Tensor, torch.Tensor],
+        mask: torch.Tensor,
+    ) -> torch.Tensor:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            hidden_states=hidden_states,
+            freqs_cis=freqs_cis,
+            kv_write_indices=kv_write_indices,
+            kv_cache=kv_cache,
+            mask=mask,
+        )
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        # MLP
+        residual = hidden_states
+        if self.pre_feedforward_layernorm is not None:
+            hidden_states = self.pre_feedforward_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        if self.post_feedforward_layernorm is not None:
+            hidden_states = self.post_feedforward_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
class GemmaModel(nn.Module):

    def __init__(self, config: gemma_config.GemmaConfig):
@@ -358,8 +465,18 @@ def __init__(self, config: gemma_config.GemmaConfig):
        self.vocab_size = config.vocab_size

        self.layers = nn.ModuleList()
-        for _ in range(config.num_hidden_layers):
-            self.layers.append(GemmaDecoderLayer(config))
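+        # Select the decoder layer class per architecture; Gemma 2 layers can
+        # take a per-layer attention type from config.attn_types.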
+        for i in range(config.num_hidden_layers):
+            if config.architecture == gemma_config.Architecture.GEMMA_1:
+                self.layers.append(GemmaDecoderLayer(config))
+            elif config.architecture == gemma_config.Architecture.GEMMA_2:
+                attn_type = (
+                    config.attn_types[i]
+                    if config.attn_types is not None
+                    else gemma_config.AttentionType.GLOBAL
+                )
+                self.layers.append(Gemma2DecoderLayer(config, attn_type))
+            else:
+                raise ValueError(f'Unknown architecture: {config.architecture}')
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
@@ -400,7 +517,7 @@ def __init__(
        self.tokenizer = tokenizer.Tokenizer(config.tokenizer)
        self.embedder = Embedding(vocab_size, config.hidden_size, config.quant)
        self.model = GemmaModel(config)
-        self.sampler = Sampler(vocab_size)
+        self.sampler = Sampler(vocab_size, config)

        # Pre-compute rotary embedding table.
        rope_theta = getattr(config, 'rope_theta', 10000)
@@ -558,9 +675,21 @@ def generate(
        return results[0] if is_str_prompt else results

    def load_weights(self, model_path: str):
-        self.load_state_dict(
-            torch.load(
-                model_path, mmap=True, weights_only=True,
-            )['model_state_dict'],
-            strict=False,
-        )
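+        # Support both a single checkpoint file and a sharded checkpoint
+        # directory indexed by pytorch_model.bin.index.json.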
+        if os.path.isfile(model_path):
+            self.load_state_dict(
+                torch.load(
+                    model_path, mmap=True, weights_only=True,
+                )['model_state_dict'],
+                strict=False,
+            )
+        else:
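+            # Load each shard listed in the index and merge it into the model.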
+            index_path = os.path.join(model_path, 'pytorch_model.bin.index.json')
+            with open(index_path, "r", encoding="utf-8") as f:
+                index = json.load(f)
+            shard_files = list(set(index["weight_map"].values()))
+            for shard_file in shard_files:
+                shard_path = os.path.join(model_path, shard_file)
+                state_dict = torch.load(shard_path, map_location="cpu", weights_only=True)
+                self.load_state_dict(state_dict, strict=False)
+                del state_dict  # Save memory.
+                gc.collect()