
Commit 4043be2

Add optimization_level to TTConfig for vLLM compile
- Add optimization_level field to TTConfig with default value 0
- Propagate optimization_level through get_pjrt_compile_config()
- Set optimization_level = 1 for Qwen models in batched inference test
1 parent 106b606 commit 4043be2

2 files changed: +14 -4 lines

integrations/vllm_plugin/vllm_tt/platform.py

Lines changed: 4 additions & 0 deletions
@@ -48,9 +48,13 @@ class TTConfig:
     # TPU model loader to share the model across multiple devices.
     enable_tensor_parallel: bool = False
 
+    # Optimization level for tt-mlir compilation.
+    optimization_level: int = 0
+
     def get_pjrt_compile_config(self) -> dict:
         return {
             "enable_const_eval": self.enable_const_eval,
+            "optimization_level": self.optimization_level,
         }
 
 
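For context, a minimal self-contained sketch of how the new field flows into the compile options returned by get_pjrt_compile_config(). The class below is only a stand-in mirroring the fields visible in this diff (the enable_const_eval default is an assumption, not taken from the commit); the real TTConfig lives in integrations/vllm_plugin/vllm_tt/platform.py.

from dataclasses import dataclass

# Stand-in mirroring the patched TTConfig; only fields visible in the diff above.
# The enable_const_eval default here is assumed for illustration.
@dataclass
class TTConfigSketch:
    enable_const_eval: bool = True
    enable_tensor_parallel: bool = False
    optimization_level: int = 0

    def get_pjrt_compile_config(self) -> dict:
        return {
            "enable_const_eval": self.enable_const_eval,
            "optimization_level": self.optimization_level,
        }

# The default keeps the previous behaviour (level 0); callers opt in to higher levels.
print(TTConfigSketch().get_pjrt_compile_config())
# {'enable_const_eval': True, 'optimization_level': 0}
print(TTConfigSketch(optimization_level=1).get_pjrt_compile_config())
# {'enable_const_eval': True, 'optimization_level': 1}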
tests/integrations/vllm_plugin/pooling/test_batched_inference.py

Lines changed: 10 additions & 4 deletions
@@ -11,15 +11,17 @@
 @pytest.mark.push
 @pytest.mark.single_device
 @pytest.mark.parametrize(
-    ["model_name", "baseline_path"],
+    ["model_name", "baseline_path", "optimization_level"],
     [
         pytest.param(
             "BAAI/bge-m3",
             "baseline/bge_m3_baseline.pt",
+            0,
         ),
         pytest.param(
             "Qwen/Qwen3-Embedding-0.6B",
             "baseline/qwen3_embedding_0.6B_baseline.pt",
+            1,
         ),
     ],
 )
@@ -33,6 +35,7 @@
 def test_batched_inference(
     model_name: str,
     baseline_path: str,
+    optimization_level: int,
     batch_size: int,
     max_num_seqs: int,
     max_num_batched_tokens: int,
@@ -57,6 +60,11 @@ def test_batched_inference(
         "We build computers for AI. We design Graph Processors, high-performance RISC CPUs, and configurable chips that run our robust software stack.",
         "The capital of France is Paris",
     ]
+    additional_config = {
+        "batch_size": batch_size,
+        "optimization_level": optimization_level,
+    }
+
     llm_args = {
         "model": model_name,
         "task": "embed",
@@ -65,9 +73,7 @@ def test_batched_inference(
         "disable_sliding_window": True,
         "max_num_batched_tokens": max_num_batched_tokens,
         "max_num_seqs": max_num_seqs,
-        "additional_config": {
-            "batch_size": batch_size,
-        },
+        "additional_config": additional_config,
     }
     model = vllm.LLM(**llm_args)
 
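A short sketch of how the parametrized optimization_level ends up in vLLM's additional_config, mirroring the Qwen case in the diff above. The batch_size value and the trimmed-down set of LLM arguments are illustrative assumptions, not the test's exact configuration.

import vllm

# Mirrors the Qwen/Qwen3-Embedding-0.6B parametrization from the diff above.
# batch_size is an assumed value here; the test parametrizes it separately.
additional_config = {
    "batch_size": 8,
    "optimization_level": 1,
}

llm_args = {
    "model": "Qwen/Qwen3-Embedding-0.6B",
    "task": "embed",
    "additional_config": additional_config,
}

# The TT plugin is expected to pick optimization_level up from additional_config
# and pass it to compilation via get_pjrt_compile_config().
model = vllm.LLM(**llm_args)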