
Commit 6d5fd84

Implement interleaved KV cache management for local and global KV
1 parent 420e000 commit 6d5fd84

7 files changed: +244 −40 lines changed


benchmark/python/benchmark_e2e.py

Lines changed: 3 additions & 0 deletions
@@ -247,6 +247,9 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length
         args.chat_template = '<s>{input}'
     elif model_type.startswith("qwen2"):
         args.chat_template = '<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n'
+    elif model_type.startswith("gemma"):
+        # Gemma and Gemma2 models use this format
+        args.chat_template = '<start_of_turn>user\n{input}<end_of_turn>\n<start_of_turn>model\n'
     else:
         raise ValueError(f"Chat Template for model type {model_type} is not known. Please provide chat template using --chat_template")

examples/python/model-qa.py

Lines changed: 19 additions & 3 deletions
@@ -146,10 +146,26 @@ def main(args):
 
         generator = og.Generator(model, params)
         if args.verbose: print("Generator created")
-        if guidance_type == "json_schema" or guidance_type == "lark_grammar":
-            messages = f"""[{{"role": "system", "content": "{system_prompt}", "tools": "{prompt_tool_input}"}}, {{"role": "user", "content": "{text}"}}]"""
+
+        # Create messages with proper JSON encoding
+        # Gemma2 models don't support system role, so we prepend system prompt to user message
+        if model.type == "gemma2":
+            combined_message = f"{system_prompt}\n\n{text}" if system_prompt else text
+            messages_list = [{"role": "user", "content": combined_message}]
+        elif guidance_type == "json_schema" or guidance_type == "lark_grammar":
+            messages_list = [
+                {"role": "system", "content": system_prompt, "tools": prompt_tool_input},
+                {"role": "user", "content": text}
+            ]
         else:
-            messages = f"""[{{"role": "system", "content": "{system_prompt}"}}, {{"role": "user", "content": "{text}"}}]"""
+            messages_list = [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": text}
+            ]
+
+        # Convert to JSON string for tokenizer
+        messages = json.dumps(messages_list)
+
         # Apply Chat Template
         if model.type == "marian-ssru":
             prompt = text
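
For reference, a small standalone sketch (values invented) of the JSON the two branches above produce before the chat template is applied:

```python
import json

system_prompt = "You are a helpful assistant."   # example values, not from the script
text = "What is a KV cache?"

# Gemma2 path: no system role, so the system prompt is folded into the user turn
gemma2_messages = json.dumps([
    {"role": "user", "content": f"{system_prompt}\n\n{text}"},
])

# Default path: separate system and user turns
default_messages = json.dumps([
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": text},
])

print(gemma2_messages)
print(default_messages)
```

Using `json.dumps` instead of hand-built f-strings also keeps quotes and backslashes in the user text properly escaped.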

src/config.cpp

Lines changed: 8 additions & 0 deletions
@@ -468,8 +468,16 @@ struct SlidingWindow_Element : JSON::Element {
     }
   }
 
+  Element& OnArray(std::string_view name) override {
+    if (name == "layer_types") {
+      return layer_types_;
+    }
+    throw JSON::unknown_value_error{};
+  }
+
  private:
   std::optional<Config::Model::Decoder::SlidingWindow>& v_;
+  StringArray_Element layer_types_{v_->layer_types};
 };
 
 struct Encoder_Element : JSON::Element {
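
To show what the new `OnArray` handler consumes, here is a hedged sketch of the corresponding `genai_config.json` fragment, generated with Python for illustration; the window size and layer list are example values only:

```python
import json

# Example sliding_window section with the new layer_types array (values are illustrative).
sliding_window = {
    "window_size": 4096,
    "slide_key_value_cache": False,
    "slide_inputs": False,
    "layer_types": ["full_attention", "sliding_attention"] * 13,  # e.g. 26 alternating layers
}

print(json.dumps({"sliding_window": sliding_window}, indent=4))
```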

src/config.h

Lines changed: 1 addition & 0 deletions
@@ -200,6 +200,7 @@ struct Config {
         std::string alignment{"right"};  // The alignment of the window, either "left" or "right"
         bool slide_key_value_cache{true};  // Whether to slide the key-value cache along with the input prompt
         bool slide_inputs{true};  // Whether to slide the input prompt along with the key-value cache
+        std::vector<std::string> layer_types;  // Layer-specific attention types: "full_attention" or "sliding_attention"
       };
       std::optional<SlidingWindow> sliding_window;

src/models/kv_cache.cpp

Lines changed: 145 additions & 33 deletions
@@ -175,23 +175,64 @@ DefaultKeyValueCache::DefaultKeyValueCache(State& state)
   }
 
   // Set the size after empty_past_ has been created with 0 for this field
+  // Check if we need to use per-layer allocation for models with alternating attention patterns
   if (state.model_.p_device_->GetType() == DeviceType::NvTensorRtRtx &&
       model_.config_->model.decoder.sliding_window.has_value() &&
-      model_.config_->model.decoder.sliding_window->window_size > 0) {
+      model_.config_->model.decoder.sliding_window->window_size > 0 &&
+      !model_.config_->model.decoder.sliding_window->layer_types.empty()) {
+    // Use per-layer allocation based on layer_types
+    use_layer_types_ = true;
+    layer_shapes_.resize(layer_count_);
+
+    int sliding_window_size = model_.config_->model.decoder.sliding_window->window_size;
+    int max_length = state_.params_->search.max_length;
+
+    for (int layer_idx = 0; layer_idx < layer_count_; ++layer_idx) {
+      layer_shapes_[layer_idx] = shape_;  // Copy base shape
+
+      const std::string& layer_type = model_.config_->model.decoder.sliding_window->layer_types[layer_idx];
+      if (layer_type == "sliding_attention") {
+        layer_shapes_[layer_idx][2] = std::min(max_length, sliding_window_size);
+      } else {  // "full_attention"
+        layer_shapes_[layer_idx][2] = max_length;
+      }
+    }
+  } else if (state.model_.p_device_->GetType() == DeviceType::NvTensorRtRtx &&
+             model_.config_->model.decoder.sliding_window.has_value() &&
+             model_.config_->model.decoder.sliding_window->window_size > 0) {
+    // Uniform sliding window allocation (backward compatibility)
     shape_[2] = std::min(state_.params_->search.max_length,
                          model_.config_->model.decoder.sliding_window->window_size);
   } else if (past_present_share_buffer_) {
     shape_[2] = state_.params_->search.max_length;
   }
 
   try {
-    for (int i = 0; i < layer_count_ * 2; ++i) {
-      presents_.push_back(OrtValue::CreateTensor(Allocator(), shape_, type_));
-
-      // Zero the memory so we don't leak any data from the previous run
-      // WebGPU device has no Zero() implementation yet. Since this zeroing is optional we disable it for WebGPU for now
-      if (Device().GetType() != DeviceType::WEBGPU) {
-        ByteWrapTensor(Device(), *presents_.back()).Zero();
+    if (use_layer_types_) {
+      // Allocate per-layer with different shapes
+      for (int layer_idx = 0; layer_idx < layer_count_; ++layer_idx) {
+        // Key tensor
+        presents_.push_back(OrtValue::CreateTensor(Allocator(), layer_shapes_[layer_idx], type_));
+        if (Device().GetType() != DeviceType::WEBGPU) {
+          ByteWrapTensor(Device(), *presents_.back()).Zero();
+        }
+
+        // Value tensor
+        presents_.push_back(OrtValue::CreateTensor(Allocator(), layer_shapes_[layer_idx], type_));
+        if (Device().GetType() != DeviceType::WEBGPU) {
+          ByteWrapTensor(Device(), *presents_.back()).Zero();
+        }
+      }
+    } else {
+      // Uniform allocation (existing behavior)
+      for (int i = 0; i < layer_count_ * 2; ++i) {
+        presents_.push_back(OrtValue::CreateTensor(Allocator(), shape_, type_));
+
+        // Zero the memory so we don't leak any data from the previous run
+        // WebGPU device has no Zero() implementation yet. Since this zeroing is optional we disable it for WebGPU for now
+        if (Device().GetType() != DeviceType::WEBGPU) {
+          ByteWrapTensor(Device(), *presents_.back()).Zero();
+        }
       }
     }
   } catch (const Ort::Exception&) {
@@ -240,10 +281,30 @@ void DefaultKeyValueCache::Update(DeviceSpan<int32_t> beam_indices, int total_length) {
     }
   }
 
-  shape_[2] = total_length;
-  for (int i = 0; i < layer_count_ * 2; i++) {
-    presents_[i] = OrtValue::CreateTensor(Allocator(), shape_, type_);
-    state_.outputs_[output_index_ + i] = presents_[i].get();
+  if (use_layer_types_) {
+    // Update per-layer shapes based on total_length, but respect max allocations
+    for (int layer_idx = 0; layer_idx < layer_count_; ++layer_idx) {
+      int max_cache_length = static_cast<int>(layer_shapes_[layer_idx][2]);
+      int actual_length = std::min(total_length, max_cache_length);
+
+      std::array<int64_t, 4> current_shape = layer_shapes_[layer_idx];
+      current_shape[2] = actual_length;
+
+      // Key tensor
+      presents_[layer_idx * 2] = OrtValue::CreateTensor(Allocator(), current_shape, type_);
+      state_.outputs_[output_index_ + layer_idx * 2] = presents_[layer_idx * 2].get();
+
+      // Value tensor
+      presents_[layer_idx * 2 + 1] = OrtValue::CreateTensor(Allocator(), current_shape, type_);
+      state_.outputs_[output_index_ + layer_idx * 2 + 1] = presents_[layer_idx * 2 + 1].get();
+    }
+  } else {
+    // Uniform shape update (existing behavior)
+    shape_[2] = total_length;
+    for (int i = 0; i < layer_count_ * 2; i++) {
+      presents_[i] = OrtValue::CreateTensor(Allocator(), shape_, type_);
+      state_.outputs_[output_index_ + i] = presents_[i].get();
+    }
   }
 
   is_first_update_ = false;
@@ -271,39 +332,90 @@ void DefaultKeyValueCache::RewindTo(size_t index) {
 
 template <typename T>
 void DefaultKeyValueCache::RewindPastTensorsTo(size_t index) {
-  assert(index > 0 && shape_[2] >= static_cast<int64_t>(index) && !past_present_share_buffer_);
-  std::array<int64_t, 4> new_shape = shape_;
-  new_shape[2] = static_cast<int>(index);
-  auto batch_x_num_heads = new_shape[0] * new_shape[1];
-  auto new_length_x_head_size = new_shape[2] * new_shape[3];
-  auto old_length_x_head_size = shape_[2] * new_shape[3];
-  shape_[2] = new_shape[2];
-
-  for (int i = 0; i < layer_count_ * 2; i++) {
-    OrtValue& present = *presents_[i];
-    std::unique_ptr<OrtValue> past = OrtValue::CreateTensor(Allocator(), shape_, type_);
+  assert(index > 0 && !past_present_share_buffer_);
+
+  if (use_layer_types_) {
+    // Handle per-layer shapes
+    for (int i = 0; i < layer_count_ * 2; i++) {
+      int layer_idx = i / 2;
+      std::array<int64_t, 4> layer_shape = layer_shapes_[layer_idx];
+      int max_cache_length = static_cast<int>(layer_shape[2]);
+
+      // Ensure we don't rewind beyond what's available
+      if (static_cast<int>(index) > max_cache_length) {
+        throw std::runtime_error("Requested rewind length is greater than the layer's cache length.");
+      }
+
+      std::array<int64_t, 4> new_shape = layer_shape;
+      new_shape[2] = static_cast<int>(index);
+      auto batch_x_num_heads = new_shape[0] * new_shape[1];
+      auto new_length_x_head_size = new_shape[2] * new_shape[3];
+
+      OrtValue& present = *presents_[i];
+      auto present_shape = present.GetTensorTypeAndShapeInfo()->GetShape();
+      auto old_length_x_head_size = present_shape[2] * new_shape[3];
+
+      std::unique_ptr<OrtValue> past = OrtValue::CreateTensor(Allocator(), new_shape, type_);
+      auto past_span = WrapTensor<T>(Device(), *past);
+      auto present_span = WrapTensor<T>(Device(), present);
+
+      for (int j = 0; j < batch_x_num_heads; j++) {
+        auto present_data = present_span.subspan(j * old_length_x_head_size, new_length_x_head_size);
+        auto past_data = past_span.subspan(j * new_length_x_head_size, new_length_x_head_size);
+        past_data.CopyFrom(present_data);
+      }
+      pasts_[i] = std::move(past);
+      state_.inputs_[input_index_ + i] = pasts_[i].get();
+    }
+  } else {
+    // Uniform shape handling (existing behavior)
+    assert(shape_[2] >= static_cast<int64_t>(index));
+    std::array<int64_t, 4> new_shape = shape_;
+    new_shape[2] = static_cast<int>(index);
+    auto batch_x_num_heads = new_shape[0] * new_shape[1];
+    auto new_length_x_head_size = new_shape[2] * new_shape[3];
+    auto old_length_x_head_size = shape_[2] * new_shape[3];
+    shape_[2] = new_shape[2];
 
-    auto past_span = WrapTensor<T>(Device(), *past);
-    auto present_span = WrapTensor<T>(Device(), present);
+    for (int i = 0; i < layer_count_ * 2; i++) {
+      OrtValue& present = *presents_[i];
+      std::unique_ptr<OrtValue> past = OrtValue::CreateTensor(Allocator(), shape_, type_);
 
-    for (int j = 0; j < batch_x_num_heads; j++) {
-      auto present_data = present_span.subspan(j * old_length_x_head_size, new_length_x_head_size);
-      auto past_data = past_span.subspan(j * new_length_x_head_size, new_length_x_head_size);
-      past_data.CopyFrom(present_data);
+      auto past_span = WrapTensor<T>(Device(), *past);
+      auto present_span = WrapTensor<T>(Device(), present);
+
+      for (int j = 0; j < batch_x_num_heads; j++) {
+        auto present_data = present_span.subspan(j * old_length_x_head_size, new_length_x_head_size);
+        auto past_data = past_span.subspan(j * new_length_x_head_size, new_length_x_head_size);
+        past_data.CopyFrom(present_data);
+      }
+      pasts_[i] = std::move(past);
+      state_.inputs_[input_index_ + i] = pasts_[i].get();
     }
-    pasts_[i] = std::move(past);
-    state_.inputs_[input_index_ + i] = pasts_[i].get();
   }
 }
 
 // Copy present state to past state reordered by the beam_indices
 template <typename ScoreType>
 void DefaultKeyValueCache::PickPastState(DeviceSpan<int32_t> beam_indices_device, int index) {
   std::span<int32_t> beam_indices = beam_indices_device.CopyDeviceToCpu();
-  auto block_size_per_beam = shape_[1] * shape_[2] * shape_[3];
+
+  std::array<int64_t, 4> tensor_shape;
+  if (use_layer_types_) {
+    // Get shape from the actual tensor for per-layer allocation
+    OrtValue& present_value = *presents_[index];
+    auto present_shape = present_value.GetTensorTypeAndShapeInfo()->GetShape();
+    for (size_t i = 0; i < 4; i++) {
+      tensor_shape[i] = present_shape[i];
+    }
+  } else {
+    tensor_shape = shape_;
+  }
+
+  auto block_size_per_beam = tensor_shape[1] * tensor_shape[2] * tensor_shape[3];
 
   OrtValue& present_value = *presents_[index];
-  std::unique_ptr<OrtValue> past_value = OrtValue::CreateTensor<ScoreType>(Allocator(), shape_);
+  std::unique_ptr<OrtValue> past_value = OrtValue::CreateTensor<ScoreType>(Allocator(), tensor_shape);
 
   auto past_span = WrapTensor<ScoreType>(Device(), *past_value);
   auto present_span = WrapTensor<ScoreType>(Device(), present_value);
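
The allocation rule above is easy to sanity-check outside C++. Here is a minimal Python sketch of the per-layer shape computation; batch size, head counts, lengths, and the layer pattern are example values, and `min(max_length, window_size)` mirrors the sliding branch:

```python
# Sketch of the per-layer KV shape rule used above (illustrative numbers).
batch_size, num_kv_heads, head_size = 1, 4, 256
max_length, window_size = 8192, 4096
layer_types = ["full_attention", "sliding_attention"] * 13  # alternating pattern

layer_shapes = []
for layer_type in layer_types:
    seq_dim = min(max_length, window_size) if layer_type == "sliding_attention" else max_length
    layer_shapes.append([batch_size, num_kv_heads, seq_dim, head_size])

per_layer_elems = sum(s[0] * s[1] * s[2] * s[3] for s in layer_shapes)
uniform_elems = len(layer_types) * batch_size * num_kv_heads * max_length * head_size
print(f"per-layer allocation uses {per_layer_elems / uniform_elems:.0%} of a uniform max_length cache")
# With these numbers the sliding layers hold half as many entries, so the total is 75%.
```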

src/models/kv_cache.h

Lines changed: 4 additions & 0 deletions
@@ -97,6 +97,10 @@ struct DefaultKeyValueCache : KeyValueCache {
   std::array<int64_t, 4> shape_;
   ONNXTensorElementDataType type_;
 
+  // Support for per-layer KV cache shapes (for models with alternating attention patterns)
+  bool use_layer_types_{false};
+  std::vector<std::array<int64_t, 4>> layer_shapes_;
+
   std::unique_ptr<OrtValue> empty_past_;
   std::vector<std::unique_ptr<OrtValue>> pasts_, presents_;
   std::vector<std::string> input_name_strings_, output_name_strings_;
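
As the indexing in `Update` and `RewindPastTensorsTo` shows, `presents_` stays interleaved as key/value pairs per layer; only the sequence dimension differs by layer when `use_layer_types_` is set. A small sketch of that layout, with hypothetical shapes for illustration only:

```python
# Hypothetical illustration of the presents_ layout: key at 2*layer, value at 2*layer + 1.
layer_shapes = [
    [1, 4, 8192, 256],  # layer 0: full_attention
    [1, 4, 4096, 256],  # layer 1: sliding_attention
]

presents = []
for layer_idx, shape in enumerate(layer_shapes):
    presents.append(("key", layer_idx, shape))    # index 2 * layer_idx
    presents.append(("value", layer_idx, shape))  # index 2 * layer_idx + 1

for i, (kind, layer_idx, shape) in enumerate(presents):
    print(f"presents_[{i}] -> layer {layer_idx} {kind}, shape {shape}")
```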

src/python/py/models/builder.py

Lines changed: 64 additions & 4 deletions
@@ -464,6 +464,11 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir):
 
         if self.ep == "trt-rtx" and self.window_size is not None and self.window_size > 0:
             genai_config["model"]["decoder"]["sliding_window"] = {"window_size": self.window_size, "slide_key_value_cache": False, "slide_inputs": False}
+
+            # Add layer-specific attention types if model has alternating attention patterns
+            layer_types = self.get_layer_types()
+            if layer_types is not None:
+                genai_config["model"]["decoder"]["sliding_window"]["layer_types"] = layer_types
 
         if self.ep != "cpu":
             ep_name = self.ep.replace("trt-rtx", "NvTensorRtRtx")
@@ -474,6 +479,25 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir):
         with open(os.path.join(out_dir,"genai_config.json"), "w") as f:
             json.dump(genai_config, f, indent=4)
 
+    def get_layer_types(self):
+        """
+        Returns a list of attention types for each layer.
+        Override in subclasses to provide layer-specific attention patterns.
+        Returns None for models with uniform attention across all layers.
+        """
+        return None
+
+    def use_alternating_kv_dimensions(self):
+        """
+        Returns True if this model needs alternating KV cache dimension names.
+        This is needed for models with alternating attention patterns when using TensorRT.
+        """
+        # Enable for models with layer_types when using TensorRT EP
+        if self.ep == "trt-rtx" and hasattr(self, 'get_layer_types'):
+            layer_types = self.get_layer_types()
+            return layer_types is not None
+        return False
+
     def save_processing(self, model_name_or_path, extra_kwargs, out_dir):
         tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token=self.hf_token, trust_remote_code=self.hf_remote, **extra_kwargs)
         print(f"Saving processing files in {out_dir} for GenAI")
@@ -650,17 +674,39 @@ def make_inputs_and_outputs(self):
 
         # Add KV cache to inputs and outputs
         for i in range(self.num_layers):
+            # Use alternating dimension names if needed (for TensorRT with alternating attention)
+            if self.use_alternating_kv_dimensions():
+                layer_types = self.get_layer_types()
+                layer_type = layer_types[i] if layer_types and i < len(layer_types) else "full_attention"
+
+                # Use dimension name based on attention type
+                if layer_type == "sliding_attention":
+                    dim_suffix = "_sliding"
+                else:  # "full_attention"
+                    dim_suffix = "_full"
+
+                past_key_shape = ["batch_size", self.num_kv_heads, f"past_sequence_length{dim_suffix}", self.head_size]
+                past_value_shape = ["batch_size", self.num_kv_heads, f"past_sequence_length{dim_suffix}", self.head_size]
+                present_key_shape = ["batch_size", self.num_kv_heads, f"total_sequence_length{dim_suffix}", self.head_size]
+                present_value_shape = ["batch_size", self.num_kv_heads, f"total_sequence_length{dim_suffix}", self.head_size]
+            else:
+                # Use standard dimension names (current behavior)
+                past_key_shape = self.input_shapes["past_key_values.key"]
+                past_value_shape = self.input_shapes["past_key_values.value"]
+                present_key_shape = self.output_shapes["present.key"]
+                present_value_shape = self.output_shapes["present.value"]
+
             # Add KV cache to inputs
             key_name = f"past_key_values.{i}.key"
-            inputs.append(self.make_value(key_name, dtype=self.input_types["past_key_values.key"], shape=self.input_shapes["past_key_values.key"]))
+            inputs.append(self.make_value(key_name, dtype=self.input_types["past_key_values.key"], shape=past_key_shape))
             value_name = f"past_key_values.{i}.value"
-            inputs.append(self.make_value(value_name, dtype=self.input_types["past_key_values.value"], shape=self.input_shapes["past_key_values.value"]))
+            inputs.append(self.make_value(value_name, dtype=self.input_types["past_key_values.value"], shape=past_value_shape))
 
             # Add KV cache to outputs
             key_name = f"present.{i}.key"
-            outputs.append(self.make_value(key_name, dtype=self.output_types["present.key"], shape=self.output_shapes["present.key"]))
+            outputs.append(self.make_value(key_name, dtype=self.output_types["present.key"], shape=present_key_shape))
             value_name = f"present.{i}.value"
-            outputs.append(self.make_value(value_name, dtype=self.output_types["present.value"], shape=self.output_shapes["present.value"]))
+            outputs.append(self.make_value(value_name, dtype=self.output_types["present.value"], shape=present_value_shape))
 
     def make_constant(self, name):
         # Make constant ops for 0, 1, 2, 3, etc.
@@ -3455,6 +3501,20 @@ def make_attention(self, layer_id, attention, root_input, **kwargs):
         super().make_attention(layer_id, attention, root_input, **kwargs)
         self.window_size = original_window_size
 
+    def get_layer_types(self):
+        """
+        Gemma2 uses alternating attention patterns:
+        - Even layers (0, 2, 4, ...): full_attention
+        - Odd layers (1, 3, 5, ...): sliding_attention
+        """
+        layer_types = []
+        for layer_id in range(self.num_layers):
+            if self.is_local(layer_id):
+                layer_types.append("sliding_attention")
+            else:
+                layer_types.append("full_attention")
+        return layer_types
+
 
 class Phi3MiniModel(MistralModel):
     def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
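
A rough sketch, outside the builder and with a made-up layer count, of what `get_layer_types` and the `_sliding`/`_full` dimension suffixes produce for a Gemma2-style model where odd layers are local:

```python
# Illustrative stand-in for the builder logic above; num_layers is an example value.
num_layers = 6

def is_local(layer_id: int) -> bool:
    # Mirrors the Gemma2 convention in this commit: odd layers use sliding attention.
    return layer_id % 2 == 1

layer_types = ["sliding_attention" if is_local(i) else "full_attention" for i in range(num_layers)]
print(layer_types)
# ['full_attention', 'sliding_attention', 'full_attention', 'sliding_attention', ...]

for i, layer_type in enumerate(layer_types):
    dim_suffix = "_sliding" if layer_type == "sliding_attention" else "_full"
    print(f"past_key_values.{i}.key -> past_sequence_length{dim_suffix}")
```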
