Skip to content

Commit df4c12e

Browse files
authored
Fix tests and improve CI (#11614)
### Summary Fix broken test `test_gather_benchmark_configs` in the CI. Ensure the tests will be triggered when the config script is modified. ### Test plan `python .ci/scripts/tests/test_gather_benchmark_configs.py` ``` ---------------------------------------------------------------------- Ran 14 tests in 0.156s OK ```
1 parent 0536862 commit df4c12e

File tree

4 files changed

+20
-8
lines changed

4 files changed

+20
-8
lines changed

.ci/scripts/gather_benchmark_configs.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -135,12 +135,11 @@ def generate_compatible_configs(model_name: str, target_os=None) -> List[str]:
135135
# etLLM recipes for Llama
136136
repo_name = model_name.split("meta-llama/")[1]
137137
if "qlora" in repo_name.lower():
138-
configs.append("llama3_qlora")
138+
configs = ["llama3_qlora"]
139139
elif "spinquant" in repo_name.lower():
140-
configs.append("llama3_spinquant")
140+
configs = ["llama3_spinquant"]
141141
else:
142-
configs.append("llama3_fb16")
143-
configs.append("et_xnnpack_custom_spda_kv_cache_8da4w")
142+
configs.extend(["llama3_fb16", "et_xnnpack_custom_spda_kv_cache_8da4w"])
144143
configs.extend(
145144
[
146145
config

.ci/scripts/tests/test_gather_benchmark_configs.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -112,15 +112,24 @@ def test_generate_compatible_configs_llama_model(self):
112112
result = self.gather_benchmark_configs.generate_compatible_configs(
113113
model_name, target_os
114114
)
115-
expected = ["llama3_fb16", "llama3_coreml_ane"]
116-
self.assertEqual(result, expected)
115+
expected = [
116+
"llama3_fb16",
117+
"llama3_coreml_ane",
118+
"et_xnnpack_custom_spda_kv_cache_8da4w",
119+
"hf_xnnpack_custom_spda_kv_cache_8da4w",
120+
]
121+
self.assertCountEqual(result, expected)
117122

118123
target_os = "android"
119124
result = self.gather_benchmark_configs.generate_compatible_configs(
120125
model_name, target_os
121126
)
122-
expected = ["llama3_fb16"]
123-
self.assertEqual(result, expected)
127+
expected = [
128+
"llama3_fb16",
129+
"et_xnnpack_custom_spda_kv_cache_8da4w",
130+
"hf_xnnpack_custom_spda_kv_cache_8da4w",
131+
]
132+
self.assertCountEqual(result, expected)
124133

125134
def test_generate_compatible_configs_quantized_llama_model(self):
126135
model_name = "meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8"

.github/workflows/android-perf.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,14 @@ on:
66
pull_request:
77
paths:
88
- .github/workflows/android-perf.yml
9+
- .ci/scripts/gather_benchmark_configs.py
910
- extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2
1011
push:
1112
branches:
1213
- main
1314
paths:
1415
- .github/workflows/android-perf.yml
16+
- .ci/scripts/gather_benchmark_configs.py
1517
- extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2
1618
# Note: GitHub has an upper limit of 10 inputs
1719
workflow_dispatch:

.github/workflows/apple-perf.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,14 @@ on:
66
pull_request:
77
paths:
88
- .github/workflows/apple-perf.yml
9+
- .ci/scripts/gather_benchmark_configs.py
910
- extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml.j2
1011
push:
1112
branches:
1213
- main
1314
paths:
1415
- .github/workflows/apple-perf.yml
16+
- .ci/scripts/gather_benchmark_configs.py
1517
- extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml.j2
1618
# Note: GitHub has an upper limit of 10 inputs
1719
workflow_dispatch:

0 commit comments

Comments
 (0)