
Commit 6fbb95f

refactor: Simplify builder.py per review - use layer indices instead of type array
1 parent 6d5fd84 commit 6fbb95f

File tree: 4 files changed (+51 -45 lines)

  src/config.cpp
  src/config.h
  src/models/kv_cache.cpp
  src/python/py/models/builder.py

src/config.cpp

Lines changed: 18 additions & 3 deletions

@@ -364,6 +364,17 @@ struct StringArray_Element : JSON::Element {
   std::vector<std::string>& v_;
 };
 
+struct IntArray_Element : JSON::Element {
+  explicit IntArray_Element(std::vector<int>& v) : v_{v} {}
+
+  void OnValue(std::string_view name, JSON::Value value) override {
+    v_.push_back(static_cast<int>(JSON::Get<double>(value)));
+  }
+
+ private:
+  std::vector<int>& v_;
+};
+
 struct StringStringMap_Element : JSON::Element {
   explicit StringStringMap_Element(std::unordered_map<std::string, std::string>& v) : v_{v} {}
 
@@ -469,15 +480,19 @@ struct SlidingWindow_Element : JSON::Element {
   }
 
   Element& OnArray(std::string_view name) override {
-    if (name == "layer_types") {
-      return layer_types_;
+    if (name == "layers") {
+      // Lazy initialize layers_ when first accessed
+      if (!layers_) {
+        layers_ = std::make_unique<IntArray_Element>(v_->layers);
+      }
+      return *layers_;
     }
     throw JSON::unknown_value_error{};
   }
 
  private:
   std::optional<Config::Model::Decoder::SlidingWindow>& v_;
-  StringArray_Element layer_types_{v_->layer_types};
+  std::unique_ptr<IntArray_Element> layers_;
 };
 
 struct Encoder_Element : JSON::Element {
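
For readers less familiar with the element-based JSON parser used here, the following is a minimal Python analogue of what the new IntArray_Element does (the class and callback names are illustrative, not the real JSON::Element API): the parser hands each array entry to the element as a number, and the element truncates it to an int and appends it to the target vector.

# Minimal Python analogue of IntArray_Element (illustrative only, not the
# real JSON::Element API): collect numeric array entries into a list of ints.
class IntArrayElement:
    def __init__(self, target):
        # Mirrors the std::vector<int>& reference held by the C++ struct.
        self.target = target

    def on_value(self, name, value):
        # JSON numbers arrive as doubles; truncate to int, like
        # static_cast<int>(JSON::Get<double>(value)) above.
        self.target.append(int(value))

layers = []
element = IntArrayElement(layers)
for v in (0.0, 2.0, 4.0):  # e.g. entries of a "layers": [0, 2, 4] array
    element.on_value("layers", v)
print(layers)  # [0, 2, 4]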

src/config.h

Lines changed: 1 addition & 1 deletion

@@ -200,7 +200,7 @@ struct Config {
       std::string alignment{"right"};        // The alignment of the window, either "left" or "right"
       bool slide_key_value_cache{true};      // Whether to slide the key-value cache along with the input prompt
       bool slide_inputs{true};               // Whether to slide the input prompt along with the key-value cache
-      std::vector<std::string> layer_types;  // Layer-specific attention types: "full_attention" or "sliding_attention"
+      std::vector<int> layers;               // Layer indices that use sliding window attention (for models with alternating patterns)
     };
     std::optional<SlidingWindow> sliding_window;
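
Taken together, the config.cpp and config.h changes alter the shape of the sliding_window section in genai_config.json. The sketch below shows that section before and after this commit; the window size and the 4-layer alternating pattern are hypothetical values chosen for illustration.

# Hypothetical 4-layer model where layers 0 and 2 use sliding window
# attention; all concrete values here are for illustration only.

# Before: per-layer attention types exported as strings.
sliding_window_before = {
    "window_size": 512,
    "slide_key_value_cache": False,
    "slide_inputs": False,
    "layer_types": ["sliding_attention", "full_attention",
                    "sliding_attention", "full_attention"],
}

# After: only the indices of the sliding window layers are exported.
sliding_window_after = {
    "window_size": 512,
    "slide_key_value_cache": False,
    "slide_inputs": False,
    "layers": [0, 2],
}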

src/models/kv_cache.cpp

Lines changed: 12 additions & 5 deletions

@@ -6,6 +6,7 @@
 #include "kv_cache.h"
 #include "windowed_kv_cache.h"
 #include "../openvino/interface.h"
+#include <unordered_set>
 
 namespace Generators {
 
@@ -179,21 +180,27 @@ DefaultKeyValueCache::DefaultKeyValueCache(State& state)
   if (state.model_.p_device_->GetType() == DeviceType::NvTensorRtRtx &&
       model_.config_->model.decoder.sliding_window.has_value() &&
       model_.config_->model.decoder.sliding_window->window_size > 0 &&
-      !model_.config_->model.decoder.sliding_window->layer_types.empty()) {
-    // Use per-layer allocation based on layer_types
+      !model_.config_->model.decoder.sliding_window->layers.empty()) {
+    // Use per-layer allocation based on sliding window layer indices
     use_layer_types_ = true;
     layer_shapes_.resize(layer_count_);
 
     int sliding_window_size = model_.config_->model.decoder.sliding_window->window_size;
     int max_length = state_.params_->search.max_length;
 
+    // Create a set of sliding window layer indices for fast lookup
+    std::unordered_set<int> sliding_layers(
+        model_.config_->model.decoder.sliding_window->layers.begin(),
+        model_.config_->model.decoder.sliding_window->layers.end());
+
     for (int layer_idx = 0; layer_idx < layer_count_; ++layer_idx) {
       layer_shapes_[layer_idx] = shape_;  // Copy base shape
 
-      const std::string& layer_type = model_.config_->model.decoder.sliding_window->layer_types[layer_idx];
-      if (layer_type == "sliding_attention") {
+      if (sliding_layers.count(layer_idx) > 0) {
+        // Sliding window layer
        layer_shapes_[layer_idx][2] = std::min(max_length, sliding_window_size);
-      } else {  // "full_attention"
+      } else {
+        // Full attention layer
         layer_shapes_[layer_idx][2] = max_length;
       }
     }
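
The per-layer allocation above reduces to a simple rule: a sliding window layer gets a cache of min(max_length, window_size) tokens along the sequence dimension, while a full attention layer gets max_length. The following is a minimal Python sketch of that rule, using hypothetical shape values rather than the real runtime state.

# Minimal sketch of the per-layer KV cache shape selection shown above.
# All concrete numbers are hypothetical, not taken from a real model.
def compute_layer_shapes(base_shape, layer_count, sliding_layers, window_size, max_length):
    """Return one KV cache shape per layer; index 2 is the sequence dimension."""
    sliding = set(sliding_layers)  # fast lookup, like the unordered_set above
    shapes = []
    for layer_idx in range(layer_count):
        shape = list(base_shape)  # copy base shape
        if layer_idx in sliding:
            # Sliding window layer: cache at most window_size tokens
            shape[2] = min(max_length, window_size)
        else:
            # Full attention layer: cache up to max_length tokens
            shape[2] = max_length
        shapes.append(shape)
    return shapes

# Example: batch=1, 8 KV heads, head_size=64, layers 0 and 2 slide.
print(compute_layer_shapes([1, 8, 0, 64], 4, [0, 2], window_size=512, max_length=2048))
# [[1, 8, 512, 64], [1, 8, 2048, 64], [1, 8, 512, 64], [1, 8, 2048, 64]]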

src/python/py/models/builder.py

Lines changed: 20 additions & 36 deletions

@@ -465,10 +465,12 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir):
         if self.ep == "trt-rtx" and self.window_size is not None and self.window_size > 0:
             genai_config["model"]["decoder"]["sliding_window"] = {"window_size": self.window_size, "slide_key_value_cache": False, "slide_inputs": False}
 
-            # Add layer-specific attention types if model has alternating attention patterns
+            # Add layer indices for sliding window layers if model has alternating attention patterns
             layer_types = self.get_layer_types()
             if layer_types is not None:
-                genai_config["model"]["decoder"]["sliding_window"]["layer_types"] = layer_types
+                # Export list of layer indices that use sliding window attention
+                sliding_layers = [i for i, lt in enumerate(layer_types) if lt == "sliding_attention"]
+                genai_config["model"]["decoder"]["sliding_window"]["layers"] = sliding_layers
 
         if self.ep != "cpu":
             ep_name = self.ep.replace("trt-rtx", "NvTensorRtRtx")

@@ -487,16 +489,14 @@ def get_layer_types(self):
         """
         return None
 
-    def use_alternating_kv_dimensions(self):
+    def make_kv_value_cache_shape(self, layer_id, shape):
         """
-        Returns True if this model needs alternating KV cache dimension names.
-        This is needed for models with alternating attention patterns when using TensorRT.
+        Modifies KV cache shape dimension names for models with alternating attention patterns.
+        For TensorRT EP with sliding window layers, replaces 'sequence' with 'sliding' in dimension name.
         """
-        # Enable for models with layer_types when using TensorRT EP
-        if self.ep == "trt-rtx" and hasattr(self, 'get_layer_types'):
-            layer_types = self.get_layer_types()
-            return layer_types is not None
-        return False
+        if self.ep == "trt-rtx" and hasattr(self, "is_local") and self.is_local(layer_id):
+            return [shape[0], shape[1], shape[2].replace("sequence", "sliding"), shape[3]]
+        return shape
 
     def save_processing(self, model_name_or_path, extra_kwargs, out_dir):
         tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token=self.hf_token, trust_remote_code=self.hf_remote, **extra_kwargs)

@@ -674,39 +674,23 @@ def make_inputs_and_outputs(self):
 
         # Add KV cache to inputs and outputs
         for i in range(self.num_layers):
-            # Use alternating dimension names if needed (for TensorRT with alternating attention)
-            if self.use_alternating_kv_dimensions():
-                layer_types = self.get_layer_types()
-                layer_type = layer_types[i] if layer_types and i < len(layer_types) else "full_attention"
-
-                # Use dimension name based on attention type
-                if layer_type == "sliding_attention":
-                    dim_suffix = "_sliding"
-                else:  # "full_attention"
-                    dim_suffix = "_full"
-
-                past_key_shape = ["batch_size", self.num_kv_heads, f"past_sequence_length{dim_suffix}", self.head_size]
-                past_value_shape = ["batch_size", self.num_kv_heads, f"past_sequence_length{dim_suffix}", self.head_size]
-                present_key_shape = ["batch_size", self.num_kv_heads, f"total_sequence_length{dim_suffix}", self.head_size]
-                present_value_shape = ["batch_size", self.num_kv_heads, f"total_sequence_length{dim_suffix}", self.head_size]
-            else:
-                # Use standard dimension names (current behavior)
-                past_key_shape = self.input_shapes["past_key_values.key"]
-                past_value_shape = self.input_shapes["past_key_values.value"]
-                present_key_shape = self.output_shapes["present.key"]
-                present_value_shape = self.output_shapes["present.value"]
-
             # Add KV cache to inputs
             key_name = f"past_key_values.{i}.key"
-            inputs.append(self.make_value(key_name, dtype=self.input_types["past_key_values.key"], shape=past_key_shape))
+            key_shape = self.make_kv_value_cache_shape(i, self.input_shapes["past_key_values.key"])
+            inputs.append(self.make_value(key_name, dtype=self.input_types["past_key_values.key"], shape=key_shape))
+
             value_name = f"past_key_values.{i}.value"
-            inputs.append(self.make_value(value_name, dtype=self.input_types["past_key_values.value"], shape=past_value_shape))
+            value_shape = self.make_kv_value_cache_shape(i, self.input_shapes["past_key_values.value"])
+            inputs.append(self.make_value(value_name, dtype=self.input_types["past_key_values.value"], shape=value_shape))
 
             # Add KV cache to outputs
             key_name = f"present.{i}.key"
-            outputs.append(self.make_value(key_name, dtype=self.output_types["present.key"], shape=present_key_shape))
+            key_shape = self.make_kv_value_cache_shape(i, self.output_shapes["present.key"])
+            outputs.append(self.make_value(key_name, dtype=self.output_types["present.key"], shape=key_shape))
+
             value_name = f"present.{i}.value"
-            outputs.append(self.make_value(value_name, dtype=self.output_types["present.value"], shape=present_value_shape))
+            value_shape = self.make_kv_value_cache_shape(i, self.output_shapes["present.value"])
+            outputs.append(self.make_value(value_name, dtype=self.output_types["present.value"], shape=value_shape))
 
     def make_constant(self, name):
         # Make constant ops for 0, 1, 2, 3, etc.
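
To see the effect of the new make_kv_value_cache_shape hook on the exported ONNX dimension names, here is a self-contained sketch. The SketchBuilder class and its is_local helper are simplified stand-ins rather than the real builder API; only the renaming logic is taken from the diff above.

# Standalone sketch of the renaming done by make_kv_value_cache_shape.
# SketchBuilder and is_local are simplified stand-ins, not the real builder.
class SketchBuilder:
    def __init__(self, ep, sliding_layers):
        self.ep = ep
        self._sliding_layers = set(sliding_layers)

    def is_local(self, layer_id):
        # Hypothetical helper: True for layers using sliding window attention.
        return layer_id in self._sliding_layers

    def make_kv_value_cache_shape(self, layer_id, shape):
        # Same logic as the builder.py change: rename the sequence dimension
        # for sliding window layers when targeting the TensorRT EP.
        if self.ep == "trt-rtx" and hasattr(self, "is_local") and self.is_local(layer_id):
            return [shape[0], shape[1], shape[2].replace("sequence", "sliding"), shape[3]]
        return shape

b = SketchBuilder(ep="trt-rtx", sliding_layers=[0, 2])
base = ["batch_size", 8, "past_sequence_length", 64]
print(b.make_kv_value_cache_shape(0, base))  # ['batch_size', 8, 'past_sliding_length', 64]
print(b.make_kv_value_cache_shape(1, base))  # ['batch_size', 8, 'past_sequence_length', 64]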
