
Commit d4eabac

Layer-wise KV Cache Allocation for Models with Alternating Attention Patterns (#1832)
## 📋 Problem Statement

### Background
Gemma2 models use an alternating attention pattern:
- **Even layers (0, 2, 4, ...)**: Full attention (requires full context, e.g., 8K tokens)
- **Odd layers (1, 3, 5, ...)**: Sliding window attention (only needs 4K tokens)

PR #1523 applied a **uniform** sliding window size (4K) to all layers for the NvTensorRtRtx EP. This was incorrect.

## ✅ Solution

### Approach
1. **Export layer-specific attention types** during model building.
2. **Use different dimension names** in ONNX to satisfy TensorRT constraints:
   - `past_sequence_length_full` for full attention layers
   - `past_sequence_length_sliding` for sliding window layers
   - Because the shapes for full attention and sliding window attention differ, TensorRT-RTX requires distinct dimension names.
3. **Implement per-layer KV cache allocation** in the runtime, based on each layer's attention type.
4. **Allocate the optimal amount of memory** for each layer type (Gemma2, for example):
   - Full attention: 8192 tokens
   - Sliding window: 4096 tokens

## Benefits for Multiple Models
This approach enables significant memory optimization for various model architectures with mixed attention patterns (the sketch after this description reproduces the estimates below):

### Gemma2
- **Pattern**: Alternating (every other layer: full, sliding, full, sliding, ...)
- **Memory**: 8K (full) vs 4K (sliding)
- **Savings**: ~25% reduction (13 full + 13 sliding vs 26 full)

### Gemma3-4B
- **Pattern**: Every 6th layer uses global attention (5 sliding + 1 full)
- **Layers**: 34 layers (29 sliding + 5 full)
- **Memory**: 128K (full) vs 1K (sliding)
- **Savings**: **~85% reduction** in KV cache memory requirements

### GPT-OSS
- **Pattern**: Alternating global and sliding window layers
- **Memory**: 131K (full) vs 128 (sliding)
- **Savings**: **~99.9% reduction** for sliding layers, enabling extremely long context windows with a manageable memory footprint
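The per-layer savings quoted above can be reproduced with a small sketch (not part of this PR). The layer counts and window sizes come from the description; the flat "tokens cached per layer" cost model and the helper names are illustrative only:

```python
def kv_tokens(num_layers, full_len, sliding_len, num_sliding_layers):
    """Tokens held in the KV cache across all layers (per key or value tensor)."""
    num_full_layers = num_layers - num_sliding_layers
    return num_full_layers * full_len + num_sliding_layers * sliding_len

def savings(num_layers, full_len, sliding_len, num_sliding_layers):
    uniform = num_layers * full_len  # every layer allocated at the full-attention length
    per_layer = kv_tokens(num_layers, full_len, sliding_len, num_sliding_layers)
    return 1 - per_layer / uniform

# Gemma2: 26 layers, alternating full/sliding, 8K vs 4K tokens
print(f"Gemma2:    {savings(26, 8192, 4096, 13):.0%}")         # ~25%
# Gemma3-4B: 34 layers (29 sliding + 5 full), 128K vs 1K tokens
print(f"Gemma3-4B: {savings(34, 131072, 1024, 29):.0%}")       # ~85%
# GPT-OSS: each sliding layer caches 128 tokens instead of 131072
print(f"GPT-OSS (per sliding layer): {1 - 128 / 131072:.1%}")  # ~99.9%
```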
1 parent 26f5fd2 commit d4eabac

File tree

7 files changed: +213 −40 lines changed

benchmark/python/benchmark_e2e.py

Lines changed: 3 additions & 0 deletions
@@ -247,6 +247,9 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length
         args.chat_template = '<s>{input}'
     elif model_type.startswith("qwen2"):
         args.chat_template = '<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n'
+    elif model_type.startswith("gemma"):
+        # Gemma and Gemma2 models use this format
+        args.chat_template = '<start_of_turn>user\n{input}<end_of_turn>\n<start_of_turn>model\n'
     else:
         raise ValueError(f"Chat Template for model type {model_type} is not known. Please provide chat template using --chat_template")
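As a quick sanity check (not part of the diff), the new template expands as follows, assuming the benchmark substitutes `{input}` the same way it does for the other chat templates in this file:

```python
chat_template = '<start_of_turn>user\n{input}<end_of_turn>\n<start_of_turn>model\n'
print(chat_template.replace('{input}', 'Tell me a joke.'))
# <start_of_turn>user
# Tell me a joke.<end_of_turn>
# <start_of_turn>model
```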

examples/python/model-qa.py

Lines changed: 19 additions & 3 deletions
@@ -146,10 +146,26 @@ def main(args):

     generator = og.Generator(model, params)
     if args.verbose: print("Generator created")
-    if guidance_type == "json_schema" or guidance_type == "lark_grammar":
-        messages = f"""[{{"role": "system", "content": "{system_prompt}", "tools": "{prompt_tool_input}"}}, {{"role": "user", "content": "{text}"}}]"""
+
+    # Create messages with proper JSON encoding
+    # Gemma2 models don't support system role, so we prepend system prompt to user message
+    if model.type == "gemma2":
+        combined_message = f"{system_prompt}\n\n{text}" if system_prompt else text
+        messages_list = [{"role": "user", "content": combined_message}]
+    elif guidance_type == "json_schema" or guidance_type == "lark_grammar":
+        messages_list = [
+            {"role": "system", "content": system_prompt, "tools": prompt_tool_input},
+            {"role": "user", "content": text}
+        ]
     else:
-        messages = f"""[{{"role": "system", "content": "{system_prompt}"}}, {{"role": "user", "content": "{text}"}}]"""
+        messages_list = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": text}
+        ]
+
+    # Convert to JSON string for tokenizer
+    messages = json.dumps(messages_list)
+
     # Apply Chat Template
     if model.type == "marian-ssru":
         prompt = text
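The switch from hand-built f-strings to `json.dumps` matters because a system prompt or user text containing quotes or newlines previously produced an invalid JSON message list. A minimal sketch of the failure mode and the fix (the example strings are made up):

```python
import json

system_prompt = 'You are a "helpful" assistant.\nBe brief.'
text = 'What is 2+2?'

# Old approach: hand-built JSON via an f-string; inner quotes and newlines are not escaped
broken = f"""[{{"role": "system", "content": "{system_prompt}"}}, {{"role": "user", "content": "{text}"}}]"""
try:
    json.loads(broken)
except json.JSONDecodeError as e:
    print("f-string message list is not valid JSON:", e)

# New approach: build a Python list and let json.dumps handle escaping
messages = json.dumps([
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": text},
])
print(json.loads(messages)[0]["content"])  # round-trips cleanly
```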

src/config.cpp

Lines changed: 23 additions & 0 deletions
@@ -359,6 +359,17 @@ struct StringArray_Element : JSON::Element {
   std::vector<std::string>& v_;
 };

+struct IntArray_Element : JSON::Element {
+  explicit IntArray_Element(std::vector<int>& v) : v_{v} {}
+
+  void OnValue(std::string_view name, JSON::Value value) override {
+    v_.push_back(static_cast<int>(JSON::Get<double>(value)));
+  }
+
+ private:
+  std::vector<int>& v_;
+};
+
 struct StringStringMap_Element : JSON::Element {
   explicit StringStringMap_Element(std::unordered_map<std::string, std::string>& v) : v_{v} {}

@@ -470,8 +481,20 @@ struct SlidingWindow_Element : JSON::Element {
     }
   }

+  Element& OnArray(std::string_view name) override {
+    if (name == "layers") {
+      // Lazy initialize layers_ when first accessed
+      if (!layers_) {
+        layers_ = std::make_unique<IntArray_Element>(v_->layers);
+      }
+      return *layers_;
+    }
+    throw JSON::unknown_value_error{};
+  }
+
  private:
   std::optional<Config::Model::Decoder::SlidingWindow>& v_;
+  std::unique_ptr<IntArray_Element> layers_;
 };

 struct Encoder_Element : JSON::Element {

src/config.h

Lines changed: 1 addition & 0 deletions
@@ -207,6 +207,7 @@ struct Config {
       std::string alignment{"right"};     // The alignment of the window, either "left" or "right"
       bool slide_key_value_cache{true};   // Whether to slide the key-value cache along with the input prompt
       bool slide_inputs{true};            // Whether to slide the input prompt along with the key-value cache
+      std::vector<int> layers;            // Layer indices that use sliding window attention (for models with alternating patterns)
     };
     std::optional<SlidingWindow> sliding_window;
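Together with the `IntArray_Element` parser above, this field lets `genai_config.json` describe which layers slide. An illustrative fragment (window size and layer indices assume a Gemma2-style 26-layer model where odd layers use sliding window attention; the real values are written by the model builder):

```python
import json

config = {
    "model": {
        "decoder": {
            "sliding_window": {
                "window_size": 4096,
                "slide_key_value_cache": False,
                "slide_inputs": False,
                "layers": list(range(1, 26, 2)),  # [1, 3, 5, ..., 25]
            }
        }
    }
}
print(json.dumps(config, indent=4))
```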

src/models/kv_cache.cpp

Lines changed: 136 additions & 32 deletions
@@ -6,6 +6,7 @@
 #include "kv_cache.h"
 #include "windowed_kv_cache.h"
 #include "../openvino/interface.h"
+#include <algorithm>

 namespace Generators {

@@ -175,21 +176,49 @@ DefaultKeyValueCache::DefaultKeyValueCache(State& state)
   }

   // Set the size after empty_past_ has been created with 0 for this field
-  if (state.model_.p_device_->GetType() == DeviceType::NvTensorRtRtx &&
-      model_.config_->model.decoder.sliding_window.has_value() &&
+  if (model_.config_->model.decoder.sliding_window.has_value() &&
      model_.config_->model.decoder.sliding_window->window_size > 0) {
-    shape_[2] = std::min(state_.params_->search.max_length,
-                         model_.config_->model.decoder.sliding_window->window_size);
+    const int sliding_window_size = model_.config_->model.decoder.sliding_window->window_size;
+    const int max_length = state_.params_->search.max_length;
+
+    // Check if we need per-layer allocation for models with alternating attention patterns
+    if (!model_.config_->model.decoder.sliding_window->layers.empty()) {
+      // Use per-layer allocation based on sliding window layer indices
+      layer_shapes_.resize(layer_count_);
+
+      // Initialize all layers with base shape and max_length
+      for (int layer_idx = 0; layer_idx < layer_count_; ++layer_idx) {
+        layer_shapes_[layer_idx] = shape_;
+        layer_shapes_[layer_idx][2] = max_length;
+      }
+
+      // Update sliding window layers with constrained cache size
+      for (int layer_idx : model_.config_->model.decoder.sliding_window->layers) {
+        layer_shapes_[layer_idx][2] = std::min(max_length, sliding_window_size);
+      }
+      // Set shape_[2] to max of all layer shapes for RewindTo bounds checking
+      shape_[2] = max_length;
+    } else {
+      // Uniform sliding window allocation (backward compatibility)
+      shape_[2] = std::min(max_length, sliding_window_size);
+    }
   } else if (past_present_share_buffer_) {
     shape_[2] = state_.params_->search.max_length;
   }

   try {
+    // Allocate KV cache tensors - 2 per layer (key and value)
+    // For per-layer shapes: alternates between key and value for each layer
+    // For uniform shape: all tensors use the same shape
     for (int i = 0; i < layer_count_ * 2; ++i) {
-      presents_.push_back(OrtValue::CreateTensor(Allocator(), shape_, type_));
+      std::array<int64_t, 4> tensor_shape = shape_;
+      if (!layer_shapes_.empty()) {
+        // Per-layer allocation: use layer-specific shape
+        // i/2 gives us the layer index since we have 2 tensors per layer
+        tensor_shape = layer_shapes_[i / 2];
+      }

-      // Zero the memory so we don't leak any data from the previous run
-      // WebGPU device has no Zero() implementation yet. Since this zeroing is optional we disable it for WebGPU for now
+      presents_.push_back(OrtValue::CreateTensor(Allocator(), tensor_shape, type_));
       if (Device().GetType() != DeviceType::WEBGPU) {
         ByteWrapTensor(Device(), *presents_.back()).Zero();
       }
@@ -240,10 +269,30 @@ void DefaultKeyValueCache::Update(DeviceSpan<int32_t> beam_indices, int total_length) {
     }
   }

-  shape_[2] = total_length;
-  for (int i = 0; i < layer_count_ * 2; i++) {
-    presents_[i] = OrtValue::CreateTensor(Allocator(), shape_, type_);
-    state_.outputs_[output_index_ + i] = presents_[i].get();
+  if (!layer_shapes_.empty()) {
+    // Update per-layer shapes based on total_length, but respect max allocations
+    for (int layer_idx = 0; layer_idx < layer_count_; ++layer_idx) {
+      const int max_cache_length = static_cast<int>(layer_shapes_[layer_idx][2]);
+      const int actual_length = std::min(total_length, max_cache_length);
+
+      std::array<int64_t, 4> current_shape = layer_shapes_[layer_idx];
+      current_shape[2] = actual_length;
+
+      // Key tensor
+      presents_[layer_idx * 2] = OrtValue::CreateTensor(Allocator(), current_shape, type_);
+      state_.outputs_[output_index_ + layer_idx * 2] = presents_[layer_idx * 2].get();
+
+      // Value tensor
+      presents_[layer_idx * 2 + 1] = OrtValue::CreateTensor(Allocator(), current_shape, type_);
+      state_.outputs_[output_index_ + layer_idx * 2 + 1] = presents_[layer_idx * 2 + 1].get();
+    }
+  } else {
+    // Uniform shape update (existing behavior)
+    shape_[2] = total_length;
+    for (int i = 0; i < layer_count_ * 2; i++) {
+      presents_[i] = OrtValue::CreateTensor(Allocator(), shape_, type_);
+      state_.outputs_[output_index_ + i] = presents_[i].get();
+    }
   }

   is_first_update_ = false;
@@ -271,39 +320,94 @@ void DefaultKeyValueCache::RewindTo(size_t index) {

 template <typename T>
 void DefaultKeyValueCache::RewindPastTensorsTo(size_t index) {
-  assert(index > 0 && shape_[2] >= static_cast<int64_t>(index) && !past_present_share_buffer_);
-  std::array<int64_t, 4> new_shape = shape_;
-  new_shape[2] = static_cast<int>(index);
-  auto batch_x_num_heads = new_shape[0] * new_shape[1];
-  auto new_length_x_head_size = new_shape[2] * new_shape[3];
-  auto old_length_x_head_size = shape_[2] * new_shape[3];
-  shape_[2] = new_shape[2];
-
-  for (int i = 0; i < layer_count_ * 2; i++) {
-    OrtValue& present = *presents_[i];
-    std::unique_ptr<OrtValue> past = OrtValue::CreateTensor(Allocator(), shape_, type_);
+  assert(index > 0 && !past_present_share_buffer_);
+
+  if (!layer_shapes_.empty()) {
+    // Handle per-layer shapes
+    // First validate that index doesn't exceed the global max_length
+    int max_length = static_cast<int>(shape_[2]);  // Set to max_length in constructor
+    if (static_cast<int>(index) > max_length) {
+      throw std::runtime_error("Requested rewind length exceeds max_length.");
+    }

-    auto past_span = WrapTensor<T>(Device(), *past);
-    auto present_span = WrapTensor<T>(Device(), present);
+    for (int i = 0; i < layer_count_ * 2; i++) {
+      const int layer_idx = i / 2;
+      const std::array<int64_t, 4> layer_shape = layer_shapes_[layer_idx];
+      const int layer_max_cache = static_cast<int>(layer_shape[2]);
+
+      // For each layer, rewind to min(index, layer's max capacity)
+      // - Full attention layers: min(index, max_length)
+      // - Sliding window layers: min(index, sliding_window_size)
+      const int actual_rewind_length = std::min(static_cast<int>(index), layer_max_cache);
+
+      std::array<int64_t, 4> new_shape = layer_shape;
+      new_shape[2] = actual_rewind_length;
+      const auto batch_x_num_heads = new_shape[0] * new_shape[1];
+      const auto new_length_x_head_size = new_shape[2] * new_shape[3];
+
+      OrtValue& present = *presents_[i];
+      const auto present_shape = present.GetTensorTypeAndShapeInfo()->GetShape();
+      const auto old_length_x_head_size = present_shape[2] * new_shape[3];
+
+      std::unique_ptr<OrtValue> past = OrtValue::CreateTensor(Allocator(), new_shape, type_);
+      auto past_span = WrapTensor<T>(Device(), *past);
+      auto present_span = WrapTensor<T>(Device(), present);
+
+      for (int j = 0; j < batch_x_num_heads; j++) {
+        auto present_data = present_span.subspan(j * old_length_x_head_size, new_length_x_head_size);
+        auto past_data = past_span.subspan(j * new_length_x_head_size, new_length_x_head_size);
+        past_data.CopyFrom(present_data);
+      }
+      pasts_[i] = std::move(past);
+      state_.inputs_[input_index_ + i] = pasts_[i].get();
+    }
+  } else {
+    // Uniform shape handling (existing behavior)
+    assert(shape_[2] >= static_cast<int64_t>(index));
+    std::array<int64_t, 4> new_shape = shape_;
+    new_shape[2] = static_cast<int>(index);
+    auto batch_x_num_heads = new_shape[0] * new_shape[1];
+    auto new_length_x_head_size = new_shape[2] * new_shape[3];
+    auto old_length_x_head_size = shape_[2] * new_shape[3];
+    shape_[2] = new_shape[2];

-    for (int j = 0; j < batch_x_num_heads; j++) {
-      auto present_data = present_span.subspan(j * old_length_x_head_size, new_length_x_head_size);
-      auto past_data = past_span.subspan(j * new_length_x_head_size, new_length_x_head_size);
-      past_data.CopyFrom(present_data);
+    for (int i = 0; i < layer_count_ * 2; i++) {
+      OrtValue& present = *presents_[i];
+      std::unique_ptr<OrtValue> past = OrtValue::CreateTensor(Allocator(), shape_, type_);
+
+      auto past_span = WrapTensor<T>(Device(), *past);
+      auto present_span = WrapTensor<T>(Device(), present);
+
+      for (int j = 0; j < batch_x_num_heads; j++) {
+        auto present_data = present_span.subspan(j * old_length_x_head_size, new_length_x_head_size);
+        auto past_data = past_span.subspan(j * new_length_x_head_size, new_length_x_head_size);
+        past_data.CopyFrom(present_data);
+      }
+      pasts_[i] = std::move(past);
+      state_.inputs_[input_index_ + i] = pasts_[i].get();
     }
-    pasts_[i] = std::move(past);
-    state_.inputs_[input_index_ + i] = pasts_[i].get();
   }
 }

 // Copy present state to past state reordered by the beam_indices
 template <typename ScoreType>
 void DefaultKeyValueCache::PickPastState(DeviceSpan<int32_t> beam_indices_device, int index) {
   std::span<int32_t> beam_indices = beam_indices_device.CopyDeviceToCpu();
-  auto block_size_per_beam = shape_[1] * shape_[2] * shape_[3];
+
+  std::array<int64_t, 4> tensor_shape;
+  if (!layer_shapes_.empty()) {
+    // Get shape from the actual tensor for per-layer allocation
+    OrtValue& present_value = *presents_[index];
+    const auto present_shape = present_value.GetTensorTypeAndShapeInfo()->GetShape();
+    std::copy(present_shape.begin(), present_shape.end(), tensor_shape.begin());
+  } else {
+    tensor_shape = shape_;
+  }
+
+  auto block_size_per_beam = tensor_shape[1] * tensor_shape[2] * tensor_shape[3];

   OrtValue& present_value = *presents_[index];
-  std::unique_ptr<OrtValue> past_value = OrtValue::CreateTensor<ScoreType>(Allocator(), shape_);
+  std::unique_ptr<OrtValue> past_value = OrtValue::CreateTensor<ScoreType>(Allocator(), tensor_shape);

   auto past_span = WrapTensor<ScoreType>(Device(), *past_value);
   auto present_span = WrapTensor<ScoreType>(Device(), present_value);
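For reference, the allocation rule in the constructor above reduces to the following Python sketch (illustrative only: the batch/head dimensions and the layer pattern are made up; only the `window_size`, `max_length`, and `layers` inputs mirror the config fields the C++ code reads):

```python
window_size, max_length = 4096, 8192
num_layers = 26
sliding_layers = set(range(1, num_layers, 2))    # Gemma2-style: odd layers slide

base_shape = [1, 4, 0, 256]                      # [batch, kv_heads, length, head_size]
layer_shapes = []
for layer_idx in range(num_layers):
    shape = list(base_shape)
    shape[2] = max_length                        # full-attention default
    if layer_idx in sliding_layers:
        shape[2] = min(max_length, window_size)  # constrained sliding-window cache
    layer_shapes.append(shape)

# Two tensors per layer (key, value): tensor i uses layer_shapes[i // 2]
assert all(layer_shapes[i // 2][2] in (max_length, window_size) for i in range(2 * num_layers))
```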

src/models/kv_cache.h

Lines changed: 3 additions & 0 deletions
@@ -97,6 +97,9 @@ struct DefaultKeyValueCache : KeyValueCache {
   std::array<int64_t, 4> shape_;
   ONNXTensorElementDataType type_;

+  // Support for per-layer KV cache shapes (for models with alternating attention patterns)
+  std::vector<std::array<int64_t, 4>> layer_shapes_;
+
   std::unique_ptr<OrtValue> empty_past_;
   std::vector<std::unique_ptr<OrtValue>> pasts_, presents_;
   std::vector<std::string> input_name_strings_, output_name_strings_;

src/python/py/models/builder.py

Lines changed: 28 additions & 5 deletions
@@ -464,7 +464,15 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir):
         }

         if self.ep == "trt-rtx" and self.window_size is not None and self.window_size > 0:
-            genai_config["model"]["decoder"]["sliding_window"] = {"window_size": self.window_size, "slide_key_value_cache": False, "slide_inputs": False}
+            # Compute layer indices that use sliding window attention
+            layer_idxs = [layer_id for layer_id in range(self.num_layers) if hasattr(self, "is_local") and self.is_local(layer_id)]
+
+            genai_config["model"]["decoder"]["sliding_window"] = {
+                "window_size": self.window_size,
+                "slide_key_value_cache": False,
+                "slide_inputs": False,
+                "layers": layer_idxs
+            }

         if self.ep != "cpu":
             ep_name = self.ep.replace("trt-rtx", "NvTensorRtRtx")
@@ -475,6 +483,15 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir):
         with open(os.path.join(out_dir,"genai_config.json"), "w") as f:
             json.dump(genai_config, f, indent=4)

+    def make_key_value_cache_shape(self, layer_id, shape):
+        """
+        Modifies KV cache shape dimension names for models with alternating attention patterns.
+        For TensorRT EP with sliding window layers, replaces 'sequence' with 'sliding' in dimension name.
+        """
+        if self.ep == "trt-rtx" and hasattr(self, "is_local") and self.is_local(layer_id):
+            return [shape[0], shape[1], shape[2].replace("sequence", "sliding"), shape[3]]
+        return shape
+
     def save_processing(self, model_name_or_path, extra_kwargs, out_dir):
         tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token=self.hf_token, trust_remote_code=self.hf_remote, **extra_kwargs)
         print(f"Saving processing files in {out_dir} for GenAI")
@@ -653,15 +670,21 @@ def make_inputs_and_outputs(self):
         for i in range(self.num_layers):
             # Add KV cache to inputs
             key_name = f"past_key_values.{i}.key"
-            inputs.append(self.make_value(key_name, dtype=self.input_types["past_key_values.key"], shape=self.input_shapes["past_key_values.key"]))
+            key_shape = self.make_key_value_cache_shape(i, self.input_shapes["past_key_values.key"])
+            inputs.append(self.make_value(key_name, dtype=self.input_types["past_key_values.key"], shape=key_shape))
+
             value_name = f"past_key_values.{i}.value"
-            inputs.append(self.make_value(value_name, dtype=self.input_types["past_key_values.value"], shape=self.input_shapes["past_key_values.value"]))
+            value_shape = self.make_key_value_cache_shape(i, self.input_shapes["past_key_values.value"])
+            inputs.append(self.make_value(value_name, dtype=self.input_types["past_key_values.value"], shape=value_shape))

             # Add KV cache to outputs
             key_name = f"present.{i}.key"
-            outputs.append(self.make_value(key_name, dtype=self.output_types["present.key"], shape=self.output_shapes["present.key"]))
+            key_shape = self.make_key_value_cache_shape(i, self.output_shapes["present.key"])
+            outputs.append(self.make_value(key_name, dtype=self.output_types["present.key"], shape=key_shape))
+
             value_name = f"present.{i}.value"
-            outputs.append(self.make_value(value_name, dtype=self.output_types["present.value"], shape=self.output_shapes["present.value"]))
+            value_shape = self.make_key_value_cache_shape(i, self.output_shapes["present.value"])
+            outputs.append(self.make_value(value_name, dtype=self.output_types["present.value"], shape=value_shape))

     def make_constant(self, name):
         # Make constant ops for 0, 1, 2, 3, etc.
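A hypothetical illustration of the resulting dimension names (the real `input_shapes` entry is not shown in this diff, so the symbolic shape below is a placeholder; only the `replace("sequence", "sliding")` rewrite comes from the code above):

```python
def make_key_value_cache_shape(is_sliding_layer, shape):
    # Mirrors: self.ep == "trt-rtx" and self.is_local(layer_id)
    if is_sliding_layer:
        return [shape[0], shape[1], shape[2].replace("sequence", "sliding"), shape[3]]
    return shape

shape = ["batch_size", 8, "past_sequence_length", 256]
print(make_key_value_cache_shape(False, shape))  # full-attention layer: unchanged
print(make_key_value_cache_shape(True, shape))   # sliding layer: ['batch_size', 8, 'past_sliding_length', 256]
```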

0 commit comments
