
Commit 4b46384

Amnah199 and anakin87 authored
feat: support structured outputs in LlamaStackChatGenerator (#2535)
* Support structured outputs
* Fix tests
* Fix linting
* Update tests
* Update integrations/llama_stack/tests/test_llama_stack_chat_generator.py

Co-authored-by: Stefano Fiorucci <[email protected]>
1 parent 475f3d0 commit 4b46384

File tree: 3 files changed, +151 −3 lines changed

Lines changed: 35 additions & 0 deletions (new file)

@@ -0,0 +1,35 @@
+# SPDX-FileCopyrightText: 2024-present deepset GmbH <[email protected]>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+# This example demonstrates how to use the LlamaStackChatGenerator component
+# with structured outputs.
+# To run this example, you will need to
+# set up Llama Stack Server and have a model available
+
+from haystack.dataclasses import ChatMessage
+from pydantic import BaseModel
+
+from haystack_integrations.components.generators.llama_stack import LlamaStackChatGenerator
+
+
+class NobelPrizeInfo(BaseModel):
+    recipient_name: str
+    award_year: int
+    category: str
+    achievement_description: str
+    nationality: str
+
+
+chat_messages = [
+    ChatMessage.from_user(
+        "In 2021, American scientist David Julius received the Nobel Prize in"
+        " Physiology or Medicine for his groundbreaking discoveries on how the human body"
+        " senses temperature and touch."
+    )
+]
+component = LlamaStackChatGenerator(generation_kwargs={"response_format": NobelPrizeInfo})
+results = component.run(chat_messages)
+
+# print(results)
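Because `response_format` constrains the reply to the schema (unless the model returns a tool call), the reply text can be loaded straight back into the Pydantic model. A minimal follow-up sketch, assuming the example above has run against a live server and returned valid JSON:

# Sketch: validate the structured reply back into the Pydantic model.
# Assumes `results` comes from the run above and no tool call was returned.
reply_text = results["replies"][0].text
prize_info = NobelPrizeInfo.model_validate_json(reply_text)  # Pydantic v2 API
print(prize_info.recipient_name, prize_info.award_year)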

integrations/llama_stack/src/haystack_integrations/components/generators/llama_stack/chat/chat_generator.py

Lines changed: 26 additions & 1 deletion
@@ -10,6 +10,8 @@
 from haystack.tools import ToolsType, deserialize_tools_or_toolset_inplace, serialize_tools_or_toolset
 from haystack.utils import deserialize_callable, serialize_callable
 from haystack.utils.auth import Secret
+from openai.lib._pydantic import to_strict_json_schema
+from pydantic import BaseModel
 
 logger = logging.getLogger(__name__)
 
@@ -94,6 +96,13 @@ def __init__(
             events as they become available, with the stream terminated by a data: [DONE] message.
         - `safe_prompt`: Whether to inject a safety prompt before all conversations.
         - `random_seed`: The seed to use for random sampling.
+        - `response_format`: A JSON schema or a Pydantic model that enforces the structure of the model's response.
+          If provided, the output will always be validated against this
+          format (unless the model returns a tool call).
+          For details, see the [OpenAI Structured Outputs documentation](https://platform.openai.com/docs/guides/structured-outputs).
+          Notes:
+          - For structured outputs with streaming,
+            the `response_format` must be a JSON schema and not a Pydantic model.
     :param timeout:
         Timeout for client calls using OpenAI API. If not set, it defaults to either the
         `OPENAI_TIMEOUT` environment variable, or 30 seconds.
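The streaming note above is the one behavioral caveat: with a streaming callback, `response_format` has to be the JSON-schema form. A minimal sketch of that combination, assuming a running Llama Stack server; the model name mirrors the one used in the tests below, and the schema itself is illustrative:

from haystack.components.generators.utils import print_streaming_chunk
from haystack.dataclasses import ChatMessage

from haystack_integrations.components.generators.llama_stack import LlamaStackChatGenerator

# With streaming enabled, `response_format` must be a JSON schema, not a Pydantic model.
city_schema = {
    "type": "json_schema",
    "json_schema": {
        "name": "CapitalCity",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {"city": {"type": "string"}, "country": {"type": "string"}},
            "required": ["city", "country"],
            "additionalProperties": False,
        },
    },
}

generator = LlamaStackChatGenerator(
    model="ollama/llama3.2:3b",
    streaming_callback=print_streaming_chunk,
    generation_kwargs={"response_format": city_schema},
)
results = generator.run([ChatMessage.from_user("What's the capital of France?")])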
@@ -137,13 +146,29 @@ def to_dict(self) -> dict[str, Any]:
             The serialized component as a dictionary.
         """
         callback_name = serialize_callable(self.streaming_callback) if self.streaming_callback else None
+        generation_kwargs = self.generation_kwargs.copy()
+        response_format = generation_kwargs.get("response_format")
+        # If the response format is a Pydantic model, it's converted to openai's json schema format
+        # If it's already a json schema, it's left as is
+        if response_format and isinstance(response_format, type) and issubclass(response_format, BaseModel):
+            json_schema = {
+                "type": "json_schema",
+                "json_schema": {
+                    "name": response_format.__name__,
+                    "strict": True,
+                    "schema": to_strict_json_schema(response_format),
+                },
+            }
+
+            generation_kwargs["response_format"] = json_schema
+
         return default_to_dict(
             self,
             model=self.model,
             streaming_callback=callback_name,
             api_base_url=self.api_base_url,
             organization=self.organization,
-            generation_kwargs=self.generation_kwargs,
+            generation_kwargs=generation_kwargs,
             timeout=self.timeout,
             max_retries=self.max_retries,
             tools=serialize_tools_or_toolset(self.tools),
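The effect of this conversion is observable without a server: configure the component with a Pydantic model, serialize it, and the `response_format` entry comes back as the JSON-schema payload. A small sketch, assuming Haystack's standard `default_to_dict` layout with an `init_parameters` key; the `CapitalCity` model is illustrative:

from pydantic import BaseModel

from haystack_integrations.components.generators.llama_stack import LlamaStackChatGenerator

class CapitalCity(BaseModel):
    city: str
    country: str

component = LlamaStackChatGenerator(
    model="ollama/llama3.2:3b",
    generation_kwargs={"response_format": CapitalCity},
)
config = component.to_dict()
response_format = config["init_parameters"]["generation_kwargs"]["response_format"]
# The Pydantic model has been replaced by a strict JSON-schema wrapper:
assert response_format["type"] == "json_schema"
assert response_format["json_schema"]["name"] == "CapitalCity"
assert response_format["json_schema"]["strict"] is True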

integrations/llama_stack/tests/test_llama_stack_chat_generator.py

Lines changed: 90 additions & 2 deletions
@@ -1,3 +1,4 @@
+import json
 from datetime import datetime
 from unittest.mock import patch
 

@@ -8,10 +9,22 @@
 from haystack.tools import Tool, Toolset
 from openai.types.chat import ChatCompletion, ChatCompletionMessage
 from openai.types.chat.chat_completion import Choice
+from pydantic import BaseModel
 
 from haystack_integrations.components.generators.llama_stack.chat.chat_generator import LlamaStackChatGenerator
 
 
+class CalendarEvent(BaseModel):
+    event_name: str
+    event_date: str
+    event_location: str
+
+
+@pytest.fixture
+def calendar_event_model():
+    return CalendarEvent
+
+
 @pytest.fixture
 def chat_messages():
     return [
@@ -135,12 +148,17 @@ def test_to_dict_default(
 
     def test_to_dict_with_parameters(
         self,
+        calendar_event_model,
     ):
         component = LlamaStackChatGenerator(
             model="ollama/llama3.2:3b",
             streaming_callback=print_streaming_chunk,
             api_base_url="test-base-url",
-            generation_kwargs={"max_tokens": 10, "some_test_param": "test-params"},
+            generation_kwargs={
+                "max_tokens": 10,
+                "some_test_param": "test-params",
+                "response_format": calendar_event_model,
+            },
             timeout=10,
             max_retries=10,
             tools=None,
@@ -158,7 +176,28 @@ def test_to_dict_with_parameters(
             "model": "ollama/llama3.2:3b",
             "api_base_url": "test-base-url",
             "streaming_callback": "haystack.components.generators.utils.print_streaming_chunk",
-            "generation_kwargs": {"max_tokens": 10, "some_test_param": "test-params"},
+            "generation_kwargs": {
+                "max_tokens": 10,
+                "some_test_param": "test-params",
+                "response_format": {
+                    "type": "json_schema",
+                    "json_schema": {
+                        "name": "CalendarEvent",
+                        "strict": True,
+                        "schema": {
+                            "properties": {
+                                "event_name": {"title": "Event Name", "type": "string"},
+                                "event_date": {"title": "Event Date", "type": "string"},
+                                "event_location": {"title": "Event Location", "type": "string"},
+                            },
+                            "required": ["event_name", "event_date", "event_location"],
+                            "title": "CalendarEvent",
+                            "type": "object",
+                            "additionalProperties": False,
+                        },
+                    },
+                },
+            },
             "timeout": 10,
             "max_retries": 10,
             "tools": None,
@@ -407,3 +446,52 @@ def test_live_run_with_mixed_tools(self, mixed_tools):
         assert len(final_message.text) > 0
         assert "paris" in final_message.text.lower()
         assert "berlin" in final_message.text.lower()
+
+    @pytest.mark.integration
+    def test_live_run_with_response_format_json_schema(self):
+        response_schema = {
+            "type": "json_schema",
+            "json_schema": {
+                "name": "CapitalCity",
+                "strict": True,
+                "schema": {
+                    "title": "CapitalCity",
+                    "type": "object",
+                    "properties": {
+                        "city": {"title": "City", "type": "string"},
+                        "country": {"title": "Country", "type": "string"},
+                    },
+                    "required": ["city", "country"],
+                    "additionalProperties": False,
+                },
+            },
+        }
+
+        chat_messages = [ChatMessage.from_user("What's the capital of France?")]
+        comp = LlamaStackChatGenerator(
+            model="ollama/llama3.2:3b", generation_kwargs={"response_format": response_schema}
+        )
+        results = comp.run(chat_messages)
+        assert len(results["replies"]) == 1
+        message: ChatMessage = results["replies"][0]
+        msg = json.loads(message.text)
+        assert "Paris" in msg["city"]
+        assert isinstance(msg["country"], str)
+        assert "France" in msg["country"]
+        assert message.meta["finish_reason"] == "stop"
+
+    @pytest.mark.integration
+    def test_live_run_with_response_format_pydantic_model(self, calendar_event_model):
+        chat_messages = [
+            ChatMessage.from_user("The marketing summit takes place on October 12th at the Hilton Hotel downtown.")
+        ]
+        component = LlamaStackChatGenerator(
+            model="ollama/llama3.2:3b", generation_kwargs={"response_format": calendar_event_model}
+        )
+        results = component.run(chat_messages)
+        assert len(results["replies"]) == 1
+        message: ChatMessage = results["replies"][0]
+        msg = json.loads(message.text)
+        assert "Marketing Summit" in msg["event_name"]
+        assert isinstance(msg["event_date"], str)
+        assert isinstance(msg["event_location"], str)
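The hand-written schema in the first live test above can also be derived from a Pydantic model using the same `to_strict_json_schema` helper the component imports, which is one way to get a streaming-compatible `response_format` without writing JSON by hand. A sketch, under the assumption that the helper's output matches the wrapper that `to_dict` builds:

from openai.lib._pydantic import to_strict_json_schema
from pydantic import BaseModel

class CapitalCity(BaseModel):
    city: str
    country: str

# Build the same wrapper shape that LlamaStackChatGenerator.to_dict() produces,
# so it can also be passed as `response_format` when streaming.
response_schema = {
    "type": "json_schema",
    "json_schema": {
        "name": CapitalCity.__name__,
        "strict": True,
        "schema": to_strict_json_schema(CapitalCity),
    },
}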
