
Commit 4043be2

Add optimization_level to TTConfig for vLLM compile
- Add optimization_level field to TTConfig with default value 0
- Propagate optimization_level through get_pjrt_compile_config()
- Set optimization_level = 1 for Qwen models in batched inference test
1 parent 106b606 commit 4043be2

2 files changed: +14 -4 lines

integrations/vllm_plugin/vllm_tt/platform.py

Lines changed: 4 additions & 0 deletions
@@ -48,9 +48,13 @@ class TTConfig:
     # TPU model loader to share the model across multiple devices.
     enable_tensor_parallel: bool = False
 
+    # Optimization level for tt-mlir compilation.
+    optimization_level: int = 0
+
     def get_pjrt_compile_config(self) -> dict:
         return {
             "enable_const_eval": self.enable_const_eval,
+            "optimization_level": self.optimization_level,
         }
 
 
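For context, a minimal self-contained sketch of how the new field flows into the compile options returned by get_pjrt_compile_config(). The class below is only a stand-in mirroring the fields visible in this diff (the enable_const_eval default is an assumption, not taken from the commit); the real TTConfig lives in integrations/vllm_plugin/vllm_tt/platform.py.

from dataclasses import dataclass

# Stand-in mirroring the patched TTConfig; only fields visible in the diff above.
# The enable_const_eval default here is assumed for illustration.
@dataclass
class TTConfigSketch:
    enable_const_eval: bool = True
    enable_tensor_parallel: bool = False
    optimization_level: int = 0

    def get_pjrt_compile_config(self) -> dict:
        return {
            "enable_const_eval": self.enable_const_eval,
            "optimization_level": self.optimization_level,
        }

# The default keeps the previous behaviour (level 0); callers opt in to higher levels.
print(TTConfigSketch().get_pjrt_compile_config())
# {'enable_const_eval': True, 'optimization_level': 0}
print(TTConfigSketch(optimization_level=1).get_pjrt_compile_config())
# {'enable_const_eval': True, 'optimization_level': 1}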
tests/integrations/vllm_plugin/pooling/test_batched_inference.py

Lines changed: 10 additions & 4 deletions
@@ -11,15 +11,17 @@
 @pytest.mark.push
 @pytest.mark.single_device
 @pytest.mark.parametrize(
-    ["model_name", "baseline_path"],
+    ["model_name", "baseline_path", "optimization_level"],
     [
         pytest.param(
             "BAAI/bge-m3",
             "baseline/bge_m3_baseline.pt",
+            0,
         ),
         pytest.param(
             "Qwen/Qwen3-Embedding-0.6B",
             "baseline/qwen3_embedding_0.6B_baseline.pt",
+            1,
         ),
     ],
 )
@@ -33,6 +35,7 @@
 def test_batched_inference(
     model_name: str,
     baseline_path: str,
+    optimization_level: int,
     batch_size: int,
     max_num_seqs: int,
     max_num_batched_tokens: int,
@@ -57,6 +60,11 @@ def test_batched_inference(
         "We build computers for AI. We design Graph Processors, high-performance RISC CPUs, and configurable chips that run our robust software stack.",
         "The capital of France is Paris",
     ]
+    additional_config = {
+        "batch_size": batch_size,
+        "optimization_level": optimization_level,
+    }
+
     llm_args = {
         "model": model_name,
         "task": "embed",
@@ -65,9 +73,7 @@ def test_batched_inference(
         "disable_sliding_window": True,
         "max_num_batched_tokens": max_num_batched_tokens,
         "max_num_seqs": max_num_seqs,
-        "additional_config": {
-            "batch_size": batch_size,
-        },
+        "additional_config": additional_config,
     }
     model = vllm.LLM(**llm_args)
 
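A short sketch of how the parametrized optimization_level ends up in vLLM's additional_config, mirroring the Qwen case in the diff above. The batch_size value and the trimmed-down set of LLM arguments are illustrative assumptions, not the test's exact configuration.

import vllm

# Mirrors the Qwen/Qwen3-Embedding-0.6B parametrization from the diff above.
# batch_size is an assumed value here; the test parametrizes it separately.
additional_config = {
    "batch_size": 8,
    "optimization_level": 1,
}

llm_args = {
    "model": "Qwen/Qwen3-Embedding-0.6B",
    "task": "embed",
    "additional_config": additional_config,
}

# The TT plugin is expected to pick optimization_level up from additional_config
# and pass it to compilation via get_pjrt_compile_config().
model = vllm.LLM(**llm_args)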