
Commit 2619605

protobird-git authored and copybara-github committed

Update the converter to support multi-signature in multimodal models.

- Pass pixel_seq_len explicitly to specify how many tokens would be reserved for images.

PiperOrigin-RevId: 748706889

1 parent 8594417

3 files changed: +36 -14 lines
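
Concretely, callers of converter.convert_to_tflite now pass pixel_seq_len alongside pixel_values_size. A minimal sketch of the updated call, assuming a multimodal PyTorch model has already been built; every concrete value below is a placeholder, and only the keyword names come from this commit:

import torch
from ai_edge_torch.generative.utilities import converter

converter.convert_to_tflite(
    pytorch_model,                       # assumed: a pre-built multimodal model
    output_path='/tmp/',                 # placeholder output directory
    output_name_prefix='my_model',       # placeholder model name prefix
    prefill_seq_len=[128, 512],          # placeholder prefill lengths
    pixel_values_size=torch.Size([1, 3, 224, 224]),  # placeholder image shape
    pixel_seq_len=256,                   # tokens reserved for image embeddings
    quantize=True,
)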

ai_edge_torch/generative/examples/paligemma/convert_to_tflite.py (1 addition, 0 deletions)

@@ -48,6 +48,7 @@ def main(_):
       pixel_values_size=torch.Size(
           [1, config.channels, config.image_size, config.image_size]
       ),
+      pixel_seq_len=(config.image_size // config.patch_size) ** 2,
       quantize=flags.FLAGS.quantize,
       config=pytorch_model.config.decoder_config,
       export_config=ExportConfig(),
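
The value passed for PaliGemma is the standard ViT patch count: the image is cut into a square grid of patch_size x patch_size patches, and each patch becomes one image token. A quick worked example with assumed numbers (this diff does not show the config values):

# Assumed values for illustration; the real ones come from PaliGemma's
# image-encoder config.
image_size = 224   # assumed input resolution
patch_size = 14    # assumed ViT patch size
pixel_seq_len = (image_size // patch_size) ** 2
print(pixel_seq_len)  # (224 // 14) ** 2 == 16 ** 2 == 256 image tokens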

ai_edge_torch/generative/examples/qwen_vl/convert_to_tflite.py (7 additions, 0 deletions)

@@ -43,6 +43,9 @@ def main(_):
   )

   grid_thw = pytorch_model.image_encoder.get_grid_thw()
+  spatial_merge_size = (
+      pytorch_model.config.image_encoder_config.spatial_merge_size
+  )
   converter.convert_to_tflite(
       pytorch_model,
       output_path=flags.FLAGS.output_path,
@@ -51,6 +54,10 @@ def main(_):
       pixel_values_size=(
           pytorch_model.image_encoder.get_pixel_values_size(grid_thw)
       ),
+      pixel_seq_len=(
+          (grid_thw[0][1] // spatial_merge_size)
+          * (grid_thw[0][2] // spatial_merge_size)
+      ),
       quantize=flags.FLAGS.quantize,
       config=pytorch_model.config.decoder_config,
       export_config=ExportConfig(),
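
For Qwen-VL the count is derived from the encoder's (t, h, w) patch grid instead: neighboring patches are merged spatially before reaching the decoder, so each spatial axis is divided by spatial_merge_size. A sketch with assumed numbers (the actual grid and merge factor are model-specific and not shown here):

# Assumed grid for illustration: one image split into a 32 x 32 patch grid.
grid_thw = [(1, 32, 32)]   # (t, h, w) as reported by the image encoder
spatial_merge_size = 2     # assumed merge factor from the encoder config
pixel_seq_len = (grid_thw[0][1] // spatial_merge_size) * (
    grid_thw[0][2] // spatial_merge_size
)
print(pixel_seq_len)  # (32 // 2) * (32 // 2) == 256 tokens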

ai_edge_torch/generative/utilities/converter.py (28 additions, 14 deletions)

@@ -57,7 +57,7 @@ def define_conversion_flags(model_name: str):
   )
   flags.DEFINE_string(
       'output_name_prefix',
-      f'{model_name}',
+      model_name,
       'The prefix of the output tflite model name.',
   )
   flags.DEFINE_multi_integer(
@@ -91,6 +91,7 @@ def convert_to_tflite(
     output_name_prefix: str,
     prefill_seq_len: Union[int, list[int]],
     pixel_values_size: torch.Size = None,
+    pixel_seq_len: int = 0,
     quantize: bool = True,
     config: cfg.ModelConfig = None,
     lora_ranks: Optional[list[int]] = None,
@@ -133,12 +134,18 @@ def convert_to_tflite(
       use. If a list, the model will have multiple prefill signatures.
     pixel_values_size (torch.Size, optional): The size of pixel values to pass
       to the model. If None, the model is not expected to take pixel values.
+    pixel_seq_len (int, optional): The length of pixel tokens, or pixel
+      embeddings generated by the image encoder with pixel values. The actual
+      length of prefill_seq_len will be added by pixel_seq_len when pixel
+      values are passed.
     quantize (bool, optional): Whether the model should be quanized. Defaults
       to True.
     config (cfg.ModelConfig, optional): The model config used to configure KV
       cache. If None, it uses the config of the pytorch_model.
     lora_ranks (list[int], optional): The ranks of the LORA layers. If None,
       no LoRA signatures will be added.
+    export_config (ExportConfig, optional): The export configuration. If None,
+      it uses the default export configuration.
   """
   # pylint: disable=protected-access
   torch._dynamo.config.cache_size_limit = 64
@@ -173,6 +180,7 @@ def convert_to_tflite(
       output_file,
       prefill_seq_lens,
       pixel_values_size,
+      pixel_seq_len,
       quantize,
       config,
       loras,
@@ -185,6 +193,7 @@ def _export_helper(
     output_file: str,
     prefill_seq_lens: list[int],
     pixel_values_size: torch.Size,
+    pixel_seq_len: int,
     quantize: bool,
     config: cfg.ModelConfig,
     loras: list[None | lora_utils.LoRA],
@@ -197,11 +206,18 @@ def _export_helper(
     prefill_tokens_list.append(torch.full((1, seq_len), 0, dtype=torch.int))
     prefill_input_pos_list.append(torch.arange(0, seq_len, dtype=torch.int))

-  prefill_pixel_values = (
-      torch.full(pixel_values_size, 0, dtype=torch.float32)
-      if pixel_values_size
-      else None
-  )
+  prefill_pixel_values = None
+  prefill_tokens_list_with_pixel = []
+  prefill_input_pos_list_with_pixel = []
+  if pixel_values_size is not None:
+    prefill_pixel_values = torch.full(pixel_values_size, 0, dtype=torch.float32)
+    for seq_len in prefill_seq_lens:
+      prefill_tokens_list_with_pixel.append(
+          torch.full((1, seq_len + pixel_seq_len), 0, dtype=torch.int)
+      )
+      prefill_input_pos_list_with_pixel.append(
+          torch.arange(0, seq_len + pixel_seq_len, dtype=torch.int)
+      )

   if export_config.prefill_mask is None:
     prefill_masks = None
@@ -238,13 +254,11 @@ def _export_helper(
   for lora in loras:
     for i in range(len(prefill_seq_lens)):
       prefill_seq_len = prefill_seq_lens[i]
-      prefill_tokens = prefill_tokens_list[i]
-      prefill_input_pos = prefill_input_pos_list[i]
       prefill_signature_name = f'prefill_{prefill_seq_len}'

       sample_kwargs = {
-          'tokens': prefill_tokens,
-          'input_pos': prefill_input_pos,
+          'tokens': prefill_tokens_list[i],
+          'input_pos': prefill_input_pos_list[i],
           'kv_cache': prefill_kv,
       }
       if prefill_masks is not None:
@@ -261,13 +275,13 @@ def _export_helper(
       )

       if prefill_pixel_values is not None:
+        sample_kwargs['tokens'] = prefill_tokens_list_with_pixel[i]
+        sample_kwargs['input_pos'] = prefill_input_pos_list_with_pixel[i]
+        sample_kwargs['pixel_values'] = prefill_pixel_values
         converter.add_signature(
             prefill_signature_name + '_pixel',
             mod,
-            sample_kwargs={
-                **sample_kwargs,
-                'pixel_values': prefill_pixel_values,
-            },
+            sample_kwargs=sample_kwargs,
         )

       sample_kwargs = {
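
Net effect of the converter change: each prefill length still gets a text-only signature traced from tokens of exactly that length, while the corresponding _pixel signature is now traced with tokens and positions stretched by pixel_seq_len, reserving room for the image embeddings. A minimal sketch of the dummy inputs, assuming a single prefill length of 128 and a pixel_seq_len of 256:

import torch

seq_len, pixel_seq_len = 128, 256  # assumed example values

# 'prefill_128' (text only): tokens of shape (1, 128), positions 0..127.
tokens = torch.full((1, seq_len), 0, dtype=torch.int)
input_pos = torch.arange(0, seq_len, dtype=torch.int)

# 'prefill_128_pixel': 256 extra slots for image embeddings, so tokens
# have shape (1, 384) and positions run 0..383.
tokens_pixel = torch.full((1, seq_len + pixel_seq_len), 0, dtype=torch.int)
input_pos_pixel = torch.arange(0, seq_len + pixel_seq_len, dtype=torch.int)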
