
Commit 93edc84

protobird-git authored and copybara-github committed
Remove kv_cache_max_len from ModelConfig.
- This is the first step to make kv_cache_max_len configurable when the model is loaded for inference.
- Infer kv_cache_max_len from kv_cache or mask. Either of them must be non-null.
- Pass kv_cache_max_len as a parameter during export.
- Build mask_cache only when mask_as_input is false.
- Confirmed that conversion generates the same tflite files before and after for gemma3, llama, and deepseek.

PiperOrigin-RevId: 766326675
1 parent 5492f26 commit 93edc84
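The change described above makes the export path derive kv_cache_max_len at call time from whichever of the KV cache or the mask is provided, instead of reading it from ModelConfig. Below is a minimal sketch of that inference logic, assuming a KV cache whose per-layer k_cache tensors carry the cache length on dimension 1 and a mask whose last dimension spans the KV positions; the helper name and the shapes are illustrative, not the library's actual implementation.

def infer_kv_cache_max_len(kv_cache, mask) -> int:
  # Either kv_cache or mask must be non-null, per the commit description.
  if kv_cache is not None:
    # Assumed layout: k_cache is [batch, kv_cache_max, num_heads, head_dim].
    return kv_cache.caches[0].k_cache.shape[1]
  if mask is not None:
    # Assumed layout: the mask's last dimension covers the KV cache positions.
    return mask.shape[-1]
  raise ValueError("Either kv_cache or mask must be provided.")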


50 files changed: +470 −545 lines

ai_edge_torch/generative/examples/amd_llama_135m/amd_llama_135m.py

Lines changed: 7 additions & 15 deletions
@@ -29,16 +29,8 @@ class AmdLlama(model_builder.DecoderOnlyModel):
   pass
 
 
-def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
-  """Returns the model config for an AMD-Llama-135m model.
-
-  Args:
-    kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
-      is 1024.
-
-  Returns:
-    The model config for an AMD-Llama-135m model.
-  """
+def get_model_config() -> cfg.ModelConfig:
+  """Returns the model config for an AMD-Llama-135m model."""
   attn_config = cfg.AttentionConfig(
       num_heads=12,
       head_dim=64,
@@ -63,16 +55,15 @@ def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
       num_layers=12,
       max_seq_len=2048,
       embedding_dim=768,
-      kv_cache_max_len=kv_cache_max_len,
       block_configs=block_config,
       final_norm_config=norm_config,
       lm_head_share_weight_with_embedding=False,
   )
   return config
 
 
-def get_fake_model_config(**kwargs) -> cfg.ModelConfig:
-  config = get_model_config(**kwargs)
+def get_fake_model_config() -> cfg.ModelConfig:
+  config = get_model_config()
   config.vocab_size = 128
   config.num_layers = 2
   config.block_config(0).ff_config.intermediate_size = 64
@@ -82,12 +73,13 @@ def get_fake_model_config(**kwargs) -> cfg.ModelConfig:
 def build_model(
     checkpoint_path: str,
     custom_loader: Callable[[str], Dict[str, torch.Tensor]] | None = None,
-    **kwargs
+    mask_cache_size: int = 0,
 ) -> nn.Module:
   return model_builder.build_decoder_only_model(
       checkpoint_path=checkpoint_path,
-      config=get_model_config(**kwargs),
+      config=get_model_config(),
       tensor_names=TENSOR_NAMES,
       model_class=AmdLlama,
       custom_loader=custom_loader,
+      mask_cache_size=mask_cache_size,
   )

ai_edge_torch/generative/examples/amd_llama_135m/convert_to_tflite.py

Lines changed: 2 additions & 1 deletion
@@ -31,13 +31,14 @@ def main(_):
       custom_loader=loader.maybe_get_custom_loader(
           checkpoint_path, flags.FLAGS.custom_checkpoint_loader
       ),
-      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
+      mask_cache_size=converter.get_mask_cache_size_from_flags(),
   )
   converter.convert_to_tflite(
       pytorch_model,
       output_path=flags.FLAGS.output_path,
       output_name_prefix=flags.FLAGS.output_name_prefix,
       prefill_seq_len=flags.FLAGS.prefill_seq_lens,
+      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
       quantize=flags.FLAGS.quantize,
       lora_ranks=flags.FLAGS.lora_ranks,
       export_config=export_config.get_from_flags(),
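Taken together, these two diffs change the calling convention for the example models: build_model now takes mask_cache_size, while kv_cache_max_len becomes an argument of converter.convert_to_tflite. A hedged sketch of the resulting export flow follows; only the parameter names come from the diff, the paths and sizes are placeholders.

from ai_edge_torch.generative.examples.amd_llama_135m import amd_llama_135m
from ai_edge_torch.generative.utilities import converter

# Placeholder checkpoint path and cache sizes, for illustration only.
pytorch_model = amd_llama_135m.build_model(
    "/path/to/amd_llama_135m",   # hypothetical checkpoint location
    mask_cache_size=1280,        # mask cache is built inside the model
)
converter.convert_to_tflite(
    pytorch_model,
    output_path="/tmp",                  # hypothetical output directory
    output_name_prefix="amd_llama_135m",
    prefill_seq_len=[64, 1024],          # hypothetical prefill lengths
    kv_cache_max_len=1280,               # now supplied at export time
)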

ai_edge_torch/generative/examples/deepseek/convert_to_tflite.py

Lines changed: 3 additions & 1 deletion
@@ -23,20 +23,22 @@
 
 flags = converter.define_conversion_flags('deepseek')
 
+
 def main(_):
   checkpoint_path = flags.FLAGS.checkpoint_path
   pytorch_model = deepseek.build_model(
       checkpoint_path,
       custom_loader=loader.maybe_get_custom_loader(
           checkpoint_path, flags.FLAGS.custom_checkpoint_loader
       ),
-      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
+      mask_cache_size=converter.get_mask_cache_size_from_flags(),
   )
   converter.convert_to_tflite(
       pytorch_model,
       output_path=flags.FLAGS.output_path,
       output_name_prefix=flags.FLAGS.output_name_prefix,
       prefill_seq_len=flags.FLAGS.prefill_seq_lens,
+      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
       quantize=flags.FLAGS.quantize,
       lora_ranks=flags.FLAGS.lora_ranks,
       export_config=export_config.get_from_flags(),
export_config=export_config.get_from_flags(),

ai_edge_torch/generative/examples/deepseek/deepseek.py

Lines changed: 7 additions & 15 deletions
@@ -29,16 +29,8 @@ class DeepSeekDistillQwen(model_builder.DecoderOnlyModel):
   pass
 
 
-def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
-  """Returns the model config for a Qwen 2.5 3B model.
-
-  Args:
-    kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
-      is 1024.
-
-  Returns:
-    The model config for a SmolLM model.
-  """
+def get_model_config() -> cfg.ModelConfig:
+  """Returns the model config for a Qwen 2.5 3B model."""
   attn_config = cfg.AttentionConfig(
       num_heads=12,
       head_dim=128,
@@ -66,16 +58,15 @@ def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
       num_layers=28,
       max_seq_len=4096,
       embedding_dim=1536,
-      kv_cache_max_len=kv_cache_max_len,
       block_configs=block_config,
       final_norm_config=norm_config,
       lm_head_share_weight_with_embedding=False,
   )
   return config
 
 
-def get_fake_model_config(**kwargs) -> cfg.ModelConfig:
-  config = get_model_config(**kwargs)
+def get_fake_model_config() -> cfg.ModelConfig:
+  config = get_model_config()
   config.vocab_size = 128
   config.num_layers = 2
   # DeepSeek-R1-Distill-Qwen has only one block config.
@@ -86,12 +77,13 @@ def get_fake_model_config(**kwargs) -> cfg.ModelConfig:
 def build_model(
     checkpoint_path: str,
     custom_loader: Callable[[str], Dict[str, torch.Tensor]] = None,
-    **kwargs
+    mask_cache_size: int = 0,
 ) -> nn.Module:
   return model_builder.build_decoder_only_model(
       checkpoint_path=checkpoint_path,
-      config=get_model_config(**kwargs),
+      config=get_model_config(),
       tensor_names=TENSOR_NAMES,
       model_class=DeepSeekDistillQwen,
       custom_loader=custom_loader,
+      mask_cache_size=mask_cache_size,
   )

ai_edge_torch/generative/examples/gemma/convert_gemma1_to_tflite.py

Lines changed: 2 additions & 1 deletion
@@ -31,13 +31,14 @@ def main(_):
       custom_loader=loader.maybe_get_custom_loader(
           checkpoint_path, flags.FLAGS.custom_checkpoint_loader
       ),
-      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
+      mask_cache_size=converter.get_mask_cache_size_from_flags(),
   )
   converter.convert_to_tflite(
       pytorch_model,
       output_path=flags.FLAGS.output_path,
       output_name_prefix=flags.FLAGS.output_name_prefix,
       prefill_seq_len=flags.FLAGS.prefill_seq_lens,
+      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
       quantize=flags.FLAGS.quantize,
       lora_ranks=flags.FLAGS.lora_ranks,
       export_config=export_config.get_from_flags(),

ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py

Lines changed: 2 additions & 1 deletion
@@ -33,13 +33,14 @@ def main(_):
       custom_loader=loader.maybe_get_custom_loader(
           checkpoint_path, flags.FLAGS.custom_checkpoint_loader
       ),
-      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
+      mask_cache_size=converter.get_mask_cache_size_from_flags(),
   )
   converter.convert_to_tflite(
       pytorch_model,
       output_path=flags.FLAGS.output_path,
       output_name_prefix=flags.FLAGS.output_name_prefix,
       prefill_seq_len=flags.FLAGS.prefill_seq_lens,
+      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
       quantize=flags.FLAGS.quantize,
       lora_ranks=flags.FLAGS.lora_ranks,
       export_config=export_config.get_from_flags(),

ai_edge_torch/generative/examples/gemma/gemma1.py

Lines changed: 8 additions & 16 deletions
@@ -42,16 +42,8 @@ class Gemma1(model_builder.DecoderOnlyModel):
   pass
 
 
-def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
-  """Returns the model config for a Gemma 2B model.
-
-  Args:
-    kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
-      is 1024.
-
-  Returns:
-    The model config for a Gemma 2B model.
-  """
+def get_model_config_2b() -> cfg.ModelConfig:
+  """Returns the model config for a Gemma 2B model."""
   attn_config = cfg.AttentionConfig(
       num_heads=8,
       head_dim=256,
@@ -80,33 +72,33 @@ def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
       max_seq_len=8192,
       embedding_dim=embedding_dim,
       embedding_scale=embedding_dim**0.5,
-      kv_cache_max_len=kv_cache_max_len,
       block_configs=block_config,
       final_norm_config=norm_config,
       lm_head_use_bias=False,
   )
   return config
 
 
-def get_fake_model_config(kv_cache_max_len: int = 128) -> cfg.ModelConfig:
-  config = get_model_config_2b(kv_cache_max_len)
+def get_fake_model_config() -> cfg.ModelConfig:
+  config = get_model_config_2b()
   # Gemma has only one block config.
   config.block_config(0).ff_config.intermediate_size = 128
   config.vocab_size = 128
   config.num_layers = 2
-  config.max_seq_len = 2 * kv_cache_max_len
+  config.max_seq_len = 256
   return config
 
 
 def build_2b_model(
     checkpoint_path: str,
     custom_loader: Callable[[str], Dict[str, torch.Tensor]] = None,
-    **kwargs
+    mask_cache_size: int = 0,
 ) -> nn.Module:
   return model_builder.build_decoder_only_model(
       checkpoint_path=checkpoint_path,
-      config=get_model_config_2b(**kwargs),
+      config=get_model_config_2b(),
       tensor_names=TENSOR_NAMES,
       model_class=Gemma1,
       custom_loader=custom_loader,
+      mask_cache_size=mask_cache_size,
   )

ai_edge_torch/generative/examples/gemma/gemma2.py

Lines changed: 24 additions & 24 deletions
@@ -104,7 +104,7 @@ def forward(
 class Gemma2(nn.Module):
   """A Gemma2 model built from the Edge Generative API layers."""
 
-  def __init__(self, config: cfg.ModelConfig):
+  def __init__(self, config: cfg.ModelConfig, mask_cache_size: int = 0):
     super().__init__()
 
     # Construct model layers.
@@ -126,17 +126,24 @@ def __init__(self, config: cfg.ModelConfig):
         config.embedding_dim,
         config.final_norm_config,
     )
-    self.mask_cache = attn_utils.build_causal_mask_cache(
-        size=config.kv_cache_max,
-    )
+    self.config = config
+    self.build_mask_cache(mask_cache_size)
+
+  def build_mask_cache(self, mask_cache_size: int):
+    assert (
+        mask_cache_size <= self.config.max_seq_len
+    ), "Mask cache size must be less than or equal to the max seq length."
+    if mask_cache_size <= 0:
+      self.mask_cache = None
+      self.sliding_window_mask_cache = None
+      return
+    self.mask_cache = attn_utils.build_causal_mask_cache(mask_cache_size)
     # Gemma2 has same hyper parameters for each layer except for attention
     # types. Use the first layer.
-    attn_config = config.block_config(0).attn_config
     self.sliding_window_mask_cache = attn_utils.build_sliding_window_mask_cache(
-        size=config.kv_cache_max,
-        window_size=attn_config.sliding_window_size,
+        size=mask_cache_size,
+        window_size=self.config.block_config(0).attn_config.sliding_window_size,
     )
-    self.config = config
 
   def get_attention_mask(
       self, attn_type: cfg.AttentionType, input_pos: torch.Tensor
@@ -167,6 +174,7 @@ def forward(
     n_elem = int(attn_config.rotary_percentage * attn_config.head_dim)
     rope = rotary_pos_emb.build_rope(input_pos, n_elem, attn_config.rotary_base)
     if mask is None:
+      assert self.mask_cache is not None, "Mask cache must be built."
      mask = [
          self.get_attention_mask(
              self.config.block_config(i).attn_config.attn_type, input_pos
@@ -222,16 +230,8 @@ def _forward_with_embeds(
     return {"logits": res, "kv_cache": updated_kv_cache}
 
 
-def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
-  """Returns the model config for a Gemma2 2B model.
-
-  Args:
-    kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
-      is 1024.
-
-  Returns:
-    The model config for a Gemma 2B model.
-  """
+def get_model_config_2b() -> cfg.ModelConfig:
+  """Returns the model config for a Gemma2 2B model."""
   norm_config = cfg.NormalizationConfig(
       type=cfg.NormalizationType.RMS_NORM, epsilon=1e-6, zero_centered=True
   )
@@ -277,7 +277,6 @@ def get_block_config(idx: int) -> cfg.TransformerBlockConfig:
       max_seq_len=8192,
       embedding_dim=embedding_dim,
       embedding_scale=embedding_dim**0.5,
-      kv_cache_max_len=kv_cache_max_len,
       block_configs=[get_block_config(i) for i in range(num_layers)],
       final_norm_config=norm_config,
       lm_head_use_bias=False,
@@ -286,11 +285,11 @@ def get_block_config(idx: int) -> cfg.TransformerBlockConfig:
   return config
 
 
-def get_fake_model_config(kv_cache_max_len: int = 128) -> cfg.ModelConfig:
-  config = get_model_config_2b(kv_cache_max_len)
+def get_fake_model_config() -> cfg.ModelConfig:
+  config = get_model_config_2b()
   config.vocab_size = 128
   config.num_layers = 2
-  config.max_seq_len = 2 * kv_cache_max_len
+  config.max_seq_len = 256
   config.embedding_dim = 128
   config.embedding_scale = config.embedding_dim**0.5
   config.block_configs = config.block_configs[: config.num_layers]
@@ -305,16 +304,17 @@ def get_fake_model_config(kv_cache_max_len: int = 128) -> cfg.ModelConfig:
 def build_2b_model(
     checkpoint_path: str,
     custom_loader: Callable[[str], Dict[str, torch.Tensor]] = None,
-    **kwargs,
+    mask_cache_size: int = 0,
 ) -> nn.Module:
   for tensor_names in TENSOR_NAMES_DICT.values():
     try:
       return model_builder.build_decoder_only_model(
           checkpoint_path=checkpoint_path,
-          config=get_model_config_2b(**kwargs),
+          config=get_model_config_2b(),
           tensor_names=tensor_names,
          model_class=Gemma2,
          custom_loader=custom_loader,
+          mask_cache_size=mask_cache_size,
       )
     except KeyError as _:
       continue
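With this restructuring, Gemma2 builds its causal and sliding-window mask caches only when mask_cache_size is positive; with the default of 0 the caches stay None and forward() asserts that a mask is passed in. A small sketch of the two construction modes, using the fake config from this file (the surrounding setup is illustrative, not part of the commit):

from ai_edge_torch.generative.examples.gemma import gemma2

config = gemma2.get_fake_model_config()

# Positive size: __init__ builds both mask caches up front.
model_with_cache = gemma2.Gemma2(config, mask_cache_size=256)
assert model_with_cache.mask_cache is not None

# Zero size (the default): no caches are built, so the caller must pass
# `mask` to forward(), e.g. when exporting with mask_as_input enabled.
model_without_cache = gemma2.Gemma2(config, mask_cache_size=0)
assert model_without_cache.mask_cache is None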

ai_edge_torch/generative/examples/gemma3/convert_gemma3_to_tflite.py

Lines changed: 2 additions & 1 deletion
@@ -40,7 +40,7 @@ def main(_):
         custom_loader=loader.maybe_get_custom_loader(
             checkpoint_path, flags.FLAGS.custom_checkpoint_loader
         ),
-        kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
+        mask_cache_size=converter.get_mask_cache_size_from_flags(),
     )
   else:
     raise ValueError(f'Unsupported model size: {_MODEL_SIZE.value}')
@@ -50,6 +50,7 @@ def main(_):
       output_path=flags.FLAGS.output_path,
       output_name_prefix=flags.FLAGS.output_name_prefix,
       prefill_seq_len=flags.FLAGS.prefill_seq_lens,
+      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
       quantize=flags.FLAGS.quantize,
       lora_ranks=flags.FLAGS.lora_ranks,
       export_config=export_config.get_from_flags(),
