Mpasumarthi/nvtrt test suite (#1756)

mpasumarthi-git · web-flow · commit 9d3631dbcf43 · 2025-11-07T08:55:05.000-08:00
Extend GenAI test suite to support NvTensorRtRtx EP by switching to a
newer model and adding deterministic tests for API flow, batch size > 1,
sampling, and out-of-place KV cache.
diff --git a/.github/workflows/linux-cpu-arm64-build.yml b/.github/workflows/linux-cpu-arm64-build.yml
@@ -27,7 +27,9 @@ jobs:
 
       - uses: actions/setup-dotnet@v5
         with:
-          dotnet-version: '8.0.x'
+          dotnet-version: |
+            8.0.x
+            9.0.x
 
       - name: Install Rust with rustup
         run: |
diff --git a/test/model_tests.cpp b/test/model_tests.cpp
@@ -4,6 +4,7 @@
 #include <cstring>  // for memcmp
 #include <iostream>
 #include <random>
+#include <filesystem>
 #include <gtest/gtest.h>
 
 #include "span.h"
@@ -249,6 +250,104 @@ TEST(ModelTests, BeamSearchGptCuda) {
 }
 #endif
 
+// NvTensorRT test cases using Phi3 models.
+// Each entry is {model directory, short label}. The directory string is formed by
+// literal concatenation with the MODEL_PATH macro (defined outside this chunk).
+// The NvTensorRtRtx tests iterate over this table and pass `first` as the model
+// path and `second` as the model label.
+static const std::pair<const char*, const char*> c_phi3_nvtrt_model_paths[] = {
+    {MODEL_PATH "hf-internal-testing/phi3-fp16-nvtrt", "fp16"},
+};
+
+// Runs generation on a Phi3 model through the NvTensorRtRtx execution provider and
+// compares the full token sequence (prompt + generated tokens) with a recorded reference.
+//
+// model_path:  model directory to load; the test is skipped when it does not exist.
+// model_label: short tag for the model variant, attached to failure messages.
+void Test_GreedySearch_Phi3_NvTensorRtRtx(const char* model_path, const char* model_label) {
+  // Skip test if NvTensorRT model is not available
+  if (!std::filesystem::exists(model_path)) {
+    GTEST_SKIP() << "NvTensorRT model not available at: " << model_path;
+  }
+  // Tag any failure below with the model variant being exercised.
+  SCOPED_TRACE(model_label);
+
+  const std::vector<int64_t> input_ids_shape{1, 19};
+  const std::vector<int32_t> input_ids{32006, 887, 526, 263, 8444, 29871, 23869, 20255, 29889, 32007, 32010, 6324, 29892, 1128, 526, 366, 29973, 32007, 32001};
+
+  // Complete expected sequence (input + generated) from model_qa.cpp using the working phi3-fp16-nvtrt model
+  const std::vector<int32_t> expected_output{
+      32006, 887, 526, 263, 8444, 29871, 23869, 20255, 29889, 32007, 32010, 6324, 29892, 1128, 526, 366, 29973, 32007, 32001,  // Input tokens (19)
+      15043, 29991, 306, 29915, 29885, 2599};
+  auto config = OgaConfig::Create(model_path);
+  config->ClearProviders();
+  config->AppendProvider("NvTensorRtRtx");
+  auto model = OgaModel::Create(*config);
+
+  constexpr int max_length = 25;
+  int batch_size = static_cast<int>(input_ids_shape[0]);
+
+  // The reference holds max_length tokens per batch entry; the per-entry indexing
+  // in the verification loop relies on that layout.
+  ASSERT_EQ(expected_output.size(), static_cast<size_t>(batch_size) * max_length);
+
+  auto params = OgaGeneratorParams::Create(*model);
+  params->SetSearchOption("max_length", max_length);
+  params->SetSearchOption("batch_size", batch_size);
+
+  auto generator = OgaGenerator::Create(*model, *params);
+  generator->AppendTokens(input_ids);
+
+  while (!generator->IsDone()) {
+    generator->GenerateNextToken();
+  }
+
+  // Verify outputs match expected outputs
+  for (int i = 0; i < batch_size; i++) {
+    auto sequence = generator->GetSequence(i);
+
+    // memcmp below reads max_length tokens from the sequence, so require that many
+    // to be present first instead of reading past the end of a shorter result.
+    ASSERT_EQ(sequence.size(), static_cast<size_t>(max_length));
+
+    const int32_t* expected_output_start = expected_output.data() + static_cast<size_t>(i) * max_length;
+    EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), max_length * sizeof(int32_t)));
+  }
+}
+
+// Runs the NvTensorRtRtx greedy-search check once for every entry of the Phi3 model table.
+TEST(ModelTests, GreedySearchPhi3NvTensorRtRtx) {
+  for (const auto& [path, label] : c_phi3_nvtrt_model_paths) {
+    Test_GreedySearch_Phi3_NvTensorRtRtx(path, label);
+  }
+}
+
+// Runs generation on a Phi3 model through the NvTensorRtRtx execution provider with
+// "past_present_share_buffer" disabled and sampling turned off, then compares the full
+// token sequence (prompt + generated tokens) with a recorded reference.
+//
+// model_path:  model directory to load; the test is skipped when it does not exist.
+// model_label: short tag for the model variant, attached to failure messages.
+void Test_OutOfPlaceKvCache_Phi3_NvTensorRtRtx(const char* model_path, const char* model_label) {
+  // Skip test if NvTensorRT model is not available
+  if (!std::filesystem::exists(model_path)) {
+    GTEST_SKIP() << "NvTensorRT model not available at: " << model_path;
+  }
+  // Tag any failure below with the model variant being exercised.
+  SCOPED_TRACE(model_label);
+
+  const std::vector<int64_t> input_ids_shape{1, 19};
+  const std::vector<int32_t> input_ids{
+      32006, 887, 526, 263, 8444, 29871, 23869, 20255, 29889,
+      32007, 32010, 6324, 29892, 1128, 526, 366, 29973, 32007, 32001};
+
+  // Expected output sequence (input + generated tokens) for validation with greedy search
+  const std::vector<int32_t> expected_output{
+      32006, 887, 526, 263, 8444, 29871, 23869, 20255, 29889, 32007, 32010, 6324, 29892, 1128, 526, 366, 29973, 32007, 32001,  // Input tokens (19)
+      15043, 1554, 13, 16271, 29892, 8733};
+
+  auto config = OgaConfig::Create(model_path);
+  config->ClearProviders();
+  config->AppendProvider("NvTensorRtRtx");
+  auto model = OgaModel::Create(*config);
+
+  constexpr int max_length = 25;
+  int batch_size = static_cast<int>(input_ids_shape[0]);
+  auto params = OgaGeneratorParams::Create(*model);
+  params->SetSearchOption("max_length", max_length);
+  params->SetSearchOption("batch_size", batch_size);
+  params->SetSearchOptionBool("past_present_share_buffer", false);
+  params->SetSearchOptionBool("do_sample", false);
+
+  auto generator = OgaGenerator::Create(*model, *params);
+  generator->AppendTokens(input_ids);
+
+  while (!generator->IsDone()) {
+    generator->GenerateNextToken();
+  }
+
+  auto sequence = generator->GetSequence(0);
+
+  // Verify output matches expected output.
+  // The size check is fatal: memcmp below reads expected_output.size() tokens from
+  // the sequence, which would run past its end if the sequence were shorter.
+  ASSERT_EQ(sequence.size(), expected_output.size());
+  EXPECT_TRUE(0 == std::memcmp(expected_output.data(), sequence.data(),
+                               expected_output.size() * sizeof(int32_t)));
+}
+
+// Runs the NvTensorRtRtx out-of-place KV cache check once for every entry of the Phi3 model table.
+TEST(ModelTests, OutOfPlaceKvCachePhi3NvTensorRtRtx) {
+  for (const auto& [path, label] : c_phi3_nvtrt_model_paths) {
+    Test_OutOfPlaceKvCache_Phi3_NvTensorRtRtx(path, label);
+  }
+}
+
 #if TEST_PHI2 && (USE_CUDA || USE_DML)
 TEST(ModelTests, TestApiDevice) {
   auto prompt = R"(
diff --git a/test/sampling_benchmark.cpp b/test/sampling_benchmark.cpp
@@ -7,6 +7,7 @@
 #include <array>
 #include <chrono>
 #include <cmath>
+#include <filesystem>
 #include <iomanip>
 #include <iostream>
 #include <numeric>
@@ -108,7 +109,11 @@ void PrintSummary(const std::vector<BenchmarkResult>& results) {
 }
 
 BenchmarkResult RunBenchmark(const BenchmarkParams& params) {
-  auto config = OgaConfig::Create(MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32");
+  const char* model_path = MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32";
+  if (strcmp(params.device_type, "NvTensorRtRtx") == 0) {
+    model_path = MODEL_PATH "hf-internal-testing/phi3-fp16-nvtrt";
+  }
+  auto config = OgaConfig::Create(model_path);
   std::string overlay = R"({ "model": { "vocab_size" : )" + std::to_string(params.vocab_size) + R"( } })";
   config->Overlay(overlay.c_str());
   config->ClearProviders();
@@ -178,6 +183,10 @@ TEST(SamplingBenchmarks, PerformanceTests) {
 #if USE_CUDA
   device_types.push_back("cuda");
 #endif
+  // Add NvTensorRtRtx if model is available
+  if (std::filesystem::exists(MODEL_PATH "hf-internal-testing/phi3-fp16-nvtrt")) {
+    device_types.push_back("NvTensorRtRtx");
+  }
 
   std::vector<int> batch_sizes = {1};
   std::vector<int> vocab_sizes = {201088};
diff --git a/test/sampling_tests.cpp b/test/sampling_tests.cpp