
Commit e3088e6

Authored by k223kim, pre-commit-ci[bot], Borda, and t-vi
[1/4] feat: add gemma 3 27b (#1998)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jirka Borovec <[email protected]>
Co-authored-by: Thomas Viehmann <[email protected]>
Co-authored-by: Jirka B <[email protected]>
1 parent 74f0fd8 commit e3088e6

File tree: 4 files changed, +210 -9 lines changed

litgpt/config.py

Lines changed: 40 additions & 0 deletions
@@ -82,6 +82,7 @@ class Config:
     # The base period of the RoPE embeddings for local attention.
     # If not provided, rope_theta will be used for both local and global attention.
     rope_local_base_freq: Optional[float] = None
+    rope_indices: Optional[List] = None

     def __post_init__(self):
         if not self.name:
@@ -1053,6 +1054,45 @@ def norm_class(self) -> Type:
     copy["hf_config"]["name"] = f"{c['hf_config']['name']}-it"
     configs.append(copy)

+##################
+# Google Gemma 3
+##################
+gemma3 = [
+    # https://huggingface.co/google/gemma-3-27b-it/blob/main/config.json
+    dict(
+        name="Gemma-3-27b-it",
+        hf_config=dict(org="google", name="gemma-3-27b-it"),
+        scale_embeddings=True,
+        attention_scores_scalar=168,
+        vocab_size=262144,
+        block_size=131072,
+        sliding_window_size=1024,
+        # 5 local layers for every global layer
+        sliding_window_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(62)],
+        intermediate_size=21504,
+        n_embd=5376,
+        n_layer=62,
+        n_head=32,
+        n_query_groups=16,
+        head_size=128,
+        rotary_percentage=1.0,
+        rope_adjustments=dict(factor=8.0),
+        parallel_residual=False,
+        bias=False,
+        norm_class_name="RMSNorm",
+        mlp_class_name="GemmaMLP",
+        gelu_approximate="tanh",
+        post_attention_norm=True,
+        post_mlp_norm=True,
+        norm_qk=True,
+        rope_base=1000000,
+        rope_local_base_freq=10000,
+        # 5 local layers for every global layer
+        rope_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(62)],
+    ),
+]
+configs.extend(gemma3)
+
 ##################
 # Google CodeGemma
 ##################
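The two per-layer lists encode Gemma 3's 5:1 local-to-global attention schedule: every sixth layer gets a 0 (global attention, full-context RoPE with rope_base=1000000), all other layers get a 1 (sliding-window attention with rope_local_base_freq=10000). A small sketch of how that pattern expands, for illustration only (the numbers are taken from the config above):

```python
# Sketch: expand the 5-local-to-1-global schedule used by Gemma-3-27b-it.
# Entry 0 marks a global layer, entry 1 a local (sliding-window) layer.
n_layer = 62
pattern = [0 if (i + 1) % 6 == 0 else 1 for i in range(n_layer)]

assert pattern[:6] == [1, 1, 1, 1, 1, 0]            # layers 1-5 local, layer 6 global
assert pattern.count(0) == n_layer // 6             # 10 global layers out of 62
assert pattern.count(1) == n_layer - n_layer // 6   # 52 local layers
```

sliding_window_indices and rope_indices stay separate per-layer switches even though Gemma 3 happens to use the same pattern for both.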

litgpt/model.py

Lines changed: 26 additions & 9 deletions
@@ -154,8 +154,18 @@ def forward(
         if self.config.scale_embeddings:
             x = x * torch.tensor(self.config.n_embd**0.5, dtype=x.dtype)

-        for block in self.transformer.h:
-            x = block(x, cos, sin, mask, input_pos, input_pos_maxp1)
+        for block_idx, block in enumerate(self.transformer.h):
+            if self.config.rope_indices is not None:
+                x = block(
+                    x,
+                    cos[..., self.config.rope_indices[block_idx]],
+                    sin[..., self.config.rope_indices[block_idx]],
+                    mask,
+                    input_pos,
+                    input_pos_maxp1,
+                )
+            else:
+                x = block(x, cos, sin, mask, input_pos, input_pos_maxp1)
         x = self.transformer.ln_f(x)
         clamp_head = (
             partial(do_softcapping, thresh=self.config.final_logit_softcapping)
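When rope_indices is set, the RoPE cache is expected to carry one cos/sin table per base frequency, stacked along a trailing dimension, and `cos[..., self.config.rope_indices[block_idx]]` slices out the table for the current block (index 0 is assumed to select the global rope_base table, index 1 the local one, matching the config above). A standalone sketch of that per-layer selection under those assumptions:

```python
import torch

T, n_elem = 8, 4
theta_global, theta_local = 1_000_000.0, 10_000.0

def rope_angles(theta: float) -> torch.Tensor:
    # Standard RoPE angles: outer product of positions and inverse frequencies.
    inv_freq = 1.0 / (theta ** (torch.arange(0, n_elem, 2).float() / n_elem))
    t = torch.arange(T).float()
    angles = torch.outer(t, inv_freq)            # (T, n_elem // 2)
    return torch.cat((angles, angles), dim=-1)   # (T, n_elem)

# Assumed layout: global and local caches stacked along a trailing dim of size 2.
cos = torch.stack(
    (rope_angles(theta_global).cos(), rope_angles(theta_local).cos()), dim=-1
)  # (T, n_elem, 2)

rope_indices = [1, 1, 1, 1, 1, 0]  # 5 local layers, then 1 global layer
for block_idx in range(len(rope_indices)):
    cos_b = cos[..., rope_indices[block_idx]]    # (T, n_elem): per-layer selection
    assert cos_b.shape == (T, n_elem)
```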
@@ -186,6 +196,10 @@ def rope_cache(self, device: Optional[torch.device] = None) -> Tuple[torch.Tenso
             elif num_params_present == 4:
                 # These parameters should always be used together so that we don't interfere with standard rope
                 extra_config = {name: self.config.rope_adjustments[name] for name in adjusted_params_required}
+            elif "factor" in self.config.rope_adjustments:
+                # linear RoPE
+                adjusted_params_required = ["factor"]
+                extra_config = {name: self.config.rope_adjustments[name] for name in adjusted_params_required}
             else:
                 # Some but not all parameters are specified; raise an error
                 missing_params = [
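This branch covers rope_adjustments that contain only `factor`, which is how the Gemma 3 config above requests linear RoPE scaling (HF's `"rope_type": "linear"`). Linear scaling conventionally stretches the usable context by dividing the position indices by the factor before the angles are computed; a hedged, self-contained sketch of that idea (not litgpt's actual build_rope_cache):

```python
import torch

def linear_rope_cos_sin(seq_len: int, n_elem: int, base: float = 1_000_000.0, factor: float = 8.0):
    """Illustrative linear ("position interpolation") RoPE: positions are divided by `factor`."""
    inv_freq = 1.0 / (base ** (torch.arange(0, n_elem, 2).float() / n_elem))
    t = torch.arange(seq_len).float() / factor   # the only change vs. unscaled RoPE
    angles = torch.outer(t, inv_freq)
    angles = torch.cat((angles, angles), dim=-1)
    return angles.cos(), angles.sin()

cos, sin = linear_rope_cos_sin(seq_len=16, n_elem=8)
print(cos.shape, sin.shape)  # torch.Size([16, 8]) torch.Size([16, 8])
```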
@@ -215,7 +229,10 @@ def set_kv_cache(
         dtype: Optional[torch.dtype] = None,
     ) -> None:
         if rope_cache_length is None:
-            rope_cache_length = self.cos.size(-1)
+            if len(self.cos.shape) == 2:
+                rope_cache_length = self.cos.size(-1)
+            else:
+                rope_cache_length = self.cos[..., 0].size(-1)

         if max_seq_length is None:
             max_seq_length = self.max_seq_length
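With a single RoPE base, `self.cos` is a 2-D (T, rope_n_elem) table; with the stacked dual-frequency cache it gains a trailing dimension, so the cache length has to be read from one slice. A toy shape check (the concrete shapes are illustrative assumptions):

```python
import torch

cos_2d = torch.randn(4096, 128)          # (T, rope_n_elem): single RoPE base
cos_3d = torch.randn(4096, 128, 2)       # (T, rope_n_elem, 2): global + local caches

for cos in (cos_2d, cos_3d):
    if len(cos.shape) == 2:
        rope_cache_length = cos.size(-1)
    else:
        rope_cache_length = cos[..., 0].size(-1)
    assert rope_cache_length == 128      # both paths recover rope_n_elem
```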
@@ -329,8 +346,8 @@ def __init__(self, config: Config, block_idx: int) -> None:
         self.apply_sliding_window_attention = config.sliding_window_indices[block_idx]

         if config.norm_qk:
-            self.norm_q = config.norm_class(config.head_size * config.n_head, eps=config.norm_eps)
-            self.norm_k = config.norm_class(config.head_size * config.n_query_groups, eps=config.norm_eps)
+            self.norm_q = config.norm_class(config.head_size, eps=config.norm_eps)
+            self.norm_k = config.norm_class(config.head_size, eps=config.norm_eps)
         else:
             self.norm_q = self.norm_k = None

@@ -370,10 +387,6 @@ def forward(
         # Split qkv into query, key and value matrices.
         q, k, v = qkv.split((query_size, key_size, value_size), dim=-1)  # 3x(B, T, C*)

-        if self.config.norm_qk:
-            q = self.norm_q(q)
-            k = self.norm_k(k)
-
         # To place the num_heads (nh) dimension right after the batch (B) dimension, the first step is to decouple the
         # embedding size (C) into num_heads (nh) and head_size (hs).
         q = q.view(B, T, n_head, head_size)  # (B, T, nh_q, hs)
@@ -387,6 +400,10 @@ def forward(
         k = k.transpose(1, 2)  # (B, nh_k, T, hs)
         v = v.transpose(1, 2)  # (B, nh_v, T, hs)

+        if self.config.norm_qk:
+            q = self.norm_q(q)
+            k = self.norm_k(k)
+
         # Unlike standard positional embeddings rotary embeddings must be applied at every layer.
         q_roped = apply_rope(q[..., :rope_n_elem], cos, sin)
         k_roped = apply_rope(k[..., :rope_n_elem], cos, sin)
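Gemma 3 normalizes queries and keys per head: the RMSNorm weight has size head_size and is applied to the last dimension after the reshape to (B, nh, T, hs), just before apply_rope. The removed code normalized the flat (B, T, C) projections across all heads at once, which is why it disagreed with the HF reference. A standalone sketch of the per-head variant, using torch.nn.RMSNorm (PyTorch >= 2.4) as a stand-in for litgpt's norm class:

```python
import torch
from torch import nn

B, T, n_head, n_kv, hs = 2, 5, 8, 4, 16

# Per-head norms: one weight vector of size head_size, shared across all heads.
norm_q = nn.RMSNorm(hs, eps=1e-6)   # stand-in for config.norm_class(config.head_size, ...)
norm_k = nn.RMSNorm(hs, eps=1e-6)

q = torch.randn(B, n_head, T, hs)   # (B, nh_q, T, hs), i.e. after view + transpose
k = torch.randn(B, n_kv, T, hs)     # (B, nh_k, T, hs)

# Normalization acts on the last (head_size) dimension of each head independently.
q, k = norm_q(q), norm_k(k)
assert q.shape == (B, n_head, T, hs) and k.shape == (B, n_kv, T, hs)
```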

litgpt/scripts/convert_hf_checkpoint.py

Lines changed: 71 additions & 0 deletions
@@ -285,6 +285,77 @@ def copy_weights_gemma_2(
                 pbar.update(progress_per_file)


+def copy_weights_gemma_3(
+    qkv_weights: Dict[int, List[Optional[NotYetLoadedTensor]]],
+    state_dict: Dict[str, torch.Tensor],
+    hf_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
+    saver: Optional[incremental_save] = None,
+    dtype: Optional[torch.dtype] = None,
+    pbar: Optional[tqdm] = None,
+    progress_per_file: Optional[float] = None,
+    debug_mode: Optional[bool] = False,
+) -> None:
+    weight_map = {
+        "model.embed_tokens.weight": "transformer.wte.weight",
+        "model.layers.{}.self_attn.q_proj.weight": None,
+        "model.layers.{}.self_attn.k_proj.weight": None,
+        "model.layers.{}.self_attn.v_proj.weight": None,
+        "model.layers.{}.self_attn.o_proj.weight": "transformer.h.{}.attn.proj.weight",
+        "model.layers.{}.mlp.gate_proj.weight": "transformer.h.{}.mlp.fc_1.weight",
+        "model.layers.{}.mlp.up_proj.weight": "transformer.h.{}.mlp.fc_2.weight",
+        "model.layers.{}.mlp.down_proj.weight": "transformer.h.{}.mlp.proj.weight",
+        "model.layers.{}.input_layernorm.weight": "transformer.h.{}.norm_1.weight",
+        "model.layers.{}.post_attention_layernorm.weight": "transformer.h.{}.post_attention_norm.weight",
+        "model.layers.{}.pre_feedforward_layernorm.weight": "transformer.h.{}.norm_2.weight",
+        "model.layers.{}.post_feedforward_layernorm.weight": "transformer.h.{}.post_mlp_norm.weight",
+        "model.norm.weight": "transformer.ln_f.weight",
+        "lm_head.weight": "lm_head.weight",
+        "model.layers.{}.self_attn.q_norm.weight": "transformer.h.{}.attn.norm_q.weight",
+        "model.layers.{}.self_attn.k_norm.weight": "transformer.h.{}.attn.norm_k.weight",
+    }
+
+    if progress_per_file is not None:
+        progress_per_file = progress_per_file / max(1, len(hf_weights) + len(qkv_weights))
+
+    for from_name, param in hf_weights.items():
+        name_template, *ids = layer_template(from_name, num_matches=2)
+        to_name = weight_map[name_template]
+        param = load_param(param, from_name, dtype, verbose=debug_mode)
+        if any(w in from_name for w in ("q_proj", "k_proj", "v_proj")):
+            qkv = qkv_weights.setdefault(ids[0], defaultdict(dict))
+            weight_name, weight_type = from_name.split(".")[-2:]
+            qkv[weight_type][weight_name] = param
+
+        if to_name is None:
+            continue
+        to_name = to_name.format(*ids)
+        if saver is not None:
+            param = saver.store_early(param)
+        state_dict[to_name] = param
+
+        if progress_per_file is not None:
+            pbar.update(progress_per_file)
+
+    if "lm_head.weight" not in state_dict:
+        state_dict["lm_head.weight"] = state_dict["transformer.wte.weight"]
+
+    for i in list(qkv_weights):
+        for weight_type in list(qkv_weights[i]):
+            qkv = qkv_weights[i][weight_type]
+            if len(qkv) != 3:
+                # qkv is split across different .bin files
+                continue
+            q = load_param(qkv["q_proj"], f"layer {i} q {weight_type}", dtype, verbose=debug_mode)
+            k = load_param(qkv["k_proj"], f"layer {i} k {weight_type}", dtype, verbose=debug_mode)
+            v = load_param(qkv["v_proj"], f"layer {i} v {weight_type}", dtype, verbose=debug_mode)
+            qkv = torch.cat((q, k, v))
+            state_dict[f"transformer.h.{i}.attn.qkv.{weight_type}"] = qkv
+            del qkv_weights[i][weight_type]

+            if progress_per_file is not None:
+                pbar.update(progress_per_file)
+
+
 def copy_weights_phi(
     config: Config,
     qkv_weights: dict,
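The converter maps HF Gemma 3 parameter names onto litgpt names and fuses the separate q/k/v projections into litgpt's single attn.qkv weight by concatenating them row-wise, in the same q, k, v order that CausalSelfAttention.forward later splits. A hedged usage sketch with toy tensors (shapes chosen arbitrarily; in practice the function is driven by convert_hf_checkpoint with a real HF state dict, as in the test below):

```python
import torch
from litgpt.scripts.convert_hf_checkpoint import copy_weights_gemma_3

n_embd, n_head, n_kv, hs = 32, 4, 2, 8

# Minimal fake HF state dict for a single layer (attention projections + embeddings only).
hf_weights = {
    "model.embed_tokens.weight": torch.randn(64, n_embd),
    "model.layers.0.self_attn.q_proj.weight": torch.randn(n_head * hs, n_embd),
    "model.layers.0.self_attn.k_proj.weight": torch.randn(n_kv * hs, n_embd),
    "model.layers.0.self_attn.v_proj.weight": torch.randn(n_kv * hs, n_embd),
    "model.layers.0.self_attn.o_proj.weight": torch.randn(n_embd, n_head * hs),
}

state_dict = {}
copy_weights_gemma_3({}, state_dict, hf_weights)

# q, k and v are concatenated row-wise into a single fused projection.
assert state_dict["transformer.h.0.attn.qkv.weight"].shape == ((n_head + 2 * n_kv) * hs, n_embd)
# Without an explicit lm_head, the embedding matrix is reused (weight tying).
assert state_dict["lm_head.weight"].shape == (64, n_embd)
```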

tests/test_model.py

Lines changed: 73 additions & 0 deletions
@@ -23,6 +23,7 @@
 from transformers.models.falcon import FalconConfig, FalconForCausalLM
 from transformers.models.gemma import GemmaConfig, GemmaForCausalLM
 from transformers.models.gemma2 import Gemma2Config, Gemma2ForCausalLM
+from transformers.models.gemma3 import Gemma3ForCausalLM, Gemma3TextConfig
 from transformers.models.gpt_neox import GPTNeoXConfig, GPTNeoXForCausalLM
 from transformers.models.llama import LlamaConfig, LlamaForCausalLM
 from transformers.models.mistral import MistralConfig, MistralForCausalLM
@@ -36,6 +37,7 @@
 from litgpt.scripts.convert_hf_checkpoint import (
     copy_weights_falcon,
     copy_weights_gemma_2,
+    copy_weights_gemma_3,
     copy_weights_gpt_neox,
     copy_weights_hf_llama,
     copy_weights_phi,
@@ -799,6 +801,77 @@ def test_against_original_gemma_2(model_name, device, dtype):
     torch.testing.assert_close(ours_y, theirs_y, rtol=3e-5, atol=3e-5)


+@torch.inference_mode()
+@pytest.mark.parametrize("model_name", ["gemma-3-27b-it"])
+@pytest.mark.parametrize(
+    ("device", "dtype"),
+    [
+        (torch.device("cpu"), torch.float32),
+        pytest.param(
+            torch.device("cuda"),
+            torch.float16,
+            marks=[
+                # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input
+                # is slightly different
+                pytest.mark.xfail(raises=AssertionError, strict=False),
+                _RunIf(min_cuda_gpus=1),
+            ],
+        ),
+    ],
+)
+def test_against_original_gemma_3(model_name, device, dtype):
+    torch.set_default_dtype(dtype)
+
+    T = 20
+    ours_config = Config.from_name(
+        model_name,
+        block_size=T,
+        sliding_window_size=T // 2,
+        n_layer=2,
+        n_head=16,
+        n_embd=32,
+        intermediate_size=86,
+    )
+
+    theirs_config = Gemma3TextConfig(
+        vocab_size=ours_config.padded_vocab_size,
+        hidden_size=ours_config.n_embd,
+        head_dim=ours_config.head_size,
+        num_attention_heads=ours_config.n_head,
+        num_hidden_layers=ours_config.n_layer,
+        intermediate_size=ours_config.intermediate_size,
+        max_position_embeddings=ours_config.block_size,
+        sliding_window=ours_config.sliding_window_size,
+        rms_norm_eps=ours_config.norm_eps,
+        num_key_value_heads=ours_config.n_query_groups,
+        rope_theta=ours_config.rope_base,
+        attention_bias=ours_config.bias,
+        tie_word_embeddings=True,
+        hidden_act="gelu_pytorch_tanh",
+        attn_implementation="eager",
+        query_pre_attn_scalar=ours_config.attention_scores_scalar,
+        rope_scaling={"factor": 8.0, "rope_type": "linear"},
+        rope_local_base_freq=ours_config.rope_local_base_freq,
+    )
+
+    theirs_model = Gemma3ForCausalLM(theirs_config).to(device)
+    theirs_state_dict = theirs_model.state_dict()
+    # Gemma weights are shipped without `lm_head.weight`
+    theirs_state_dict.pop("lm_head.weight")
+    state_dict = {}
+
+    copy_weights_gemma_3({}, state_dict, theirs_state_dict)
+    ours_model = GPT(ours_config).to(device)
+    ours_model.load_state_dict(state_dict)
+
+    # test end to end
+    x = torch.randint(low=0, high=ours_config.padded_vocab_size, size=(T,), device=device).unsqueeze(0)
+    assert x.size(1) == T
+    ours_y = ours_model(x)
+    theirs_y = theirs_model(x)["logits"].to(dtype)  # HF converts logits to float
+    torch.testing.assert_close(ours_y, theirs_y, rtol=3e-5, atol=3e-5)
+
+
 @torch.inference_mode()
 @pytest.mark.parametrize(
     "model_name", ["Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B", "Qwen2.5-Math-1.5B", "QwQ-32B-Preview", "QwQ-32B"]
