Commit 39d8402

update streaming for responses (#321)
1 parent 4a5af48 commit 39d8402

13 files changed (+302 -52 lines changed)

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
@@ -13,7 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [0.88.0]

 ### Added
-- Added support for OpenAI's Responses API (`/responses` endpoint) via `OpenAIResponseSchema`. Supports reasoning traces, multi-turn conversations with `previous_response_id`, and structured extraction with `aiextract`. Use `aigenerate(OpenAIResponseSchema(), prompt; model="o4-mini")` for reasoning models (access via `result.extras[:reasoning_content]`). See `examples/working_with_responses_api.jl`. Note: Many features are not supported yet, eg, streaming, built-in tools, etc.
+- Added support for OpenAI's Responses API (`/responses` endpoint) via `OpenAIResponseSchema`. Supports reasoning traces, multi-turn conversations with `previous_response_id`, and structured extraction with `aiextract`. Use `aigenerate(OpenAIResponseSchema(), prompt; model="o4-mini")` for reasoning models (access via `result.extras[:reasoning_content]`). See `examples/working_with_responses_api.jl`. Note: Many features are not supported yet, eg, built-in tools, etc.
+- Added support for streaming responses with `OpenAIResponseSchema` via a dedicated `StreamCallback` flavor. See `examples/working_with_responses_api.jl`.

 ## [0.87.0]
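The new streaming entry can be exercised with a minimal call. This is a sketch mirroring the usage shown elsewhere in this commit; it assumes a configured `OPENAI_API_KEY` and that the `gpt-5-mini` model is available to your account:

```julia
using PromptingTools
const PT = PromptingTools

schema = PT.OpenAIResponseSchema()

# Passing `streamcallback = stdout` prints tokens as they arrive;
# the returned message still carries the full final content.
msg = aigenerate(schema, "Count from 1 to 10, one number per line.";
    model = "gpt-5-mini",
    streamcallback = stdout,
    verbose = false)
```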

Project.toml

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ REPL = "<0.0.1, 1"
 Random = "<0.0.1, 1"
 SparseArrays = "<0.0.1, 1"
 Statistics = "<0.0.1, 1"
-StreamCallbacks = "0.6.2"
+StreamCallbacks = "0.7"
 Test = "<0.0.1, 1"
 Unicode = "<0.0.1, 1"
 julia = "1.9, 1.10, 1.11"

README.md

Lines changed: 6 additions & 0 deletions
@@ -619,6 +619,12 @@ println(msg.extras[:reasoning_content])
 # Continue conversations using previous_response_id
 msg2 = aigenerate(schema, "Tell me more";
     model="gpt-5-mini", previous_response_id=msg.extras[:response_id])
+
+# Streaming responses
+msg = aigenerate(schema, "Count from 1 to 10, one number per line.";
+    model = "gpt-5-mini",
+    streamcallback = stdout,
+    verbose = false)
 ```

 **When to use which API:**

docs/src/coverage_of_model_providers.md

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ Below is an overview of the model providers supported by PromptingTools.jl, alon
 | Abstract Schema | Schema | Model Provider | aigenerate | aiembed | aiextract | aiscan | aiimage | aiclassify |
 |-------------------------|---------------------------|----------------------------------------|------------|---------|-----------|--------|---------|------------|
 | AbstractOpenAISchema | OpenAISchema | OpenAI (Chat Completions) |||||||
-| AbstractResponseSchema | OpenAIResponseSchema*** | OpenAI (Responses API) |||||||
+| AbstractOpenAIResponseSchema | OpenAIResponseSchema*** | OpenAI (Responses API) |||||||
 | AbstractOpenAISchema | CustomOpenAISchema* | Any OpenAI-compatible API (eg, vLLM)* |||||||
 | AbstractOpenAISchema | LocalServerOpenAISchema** | Any OpenAI-compatible Local server** |||||||
 | AbstractOpenAISchema | MistralOpenAISchema | Mistral AI |||||||

examples/working_with_responses_api.jl

Lines changed: 50 additions & 2 deletions
@@ -215,8 +215,8 @@ println("Usage: ", response.extras[:usage])
 # ## 10. Using with Templates
 #
 # Works with PromptingTools templates:
-
-response = aigenerate(schema, :BlankSystemUser;
+tpl = PT.render(AITemplate(:BlankSystemUser))
+response = aigenerate(schema, tpl;
     system = "You are a helpful coding assistant specialized in Julia.",
     user = "How do I read a CSV file?",
     model = "gpt-5-mini",
@@ -225,6 +225,53 @@ response = aigenerate(schema, :BlankSystemUser;
 println("\n=== Template Usage ===")
 println("Template response: ", response.content)

+# ## 11. Streaming Responses
+#
+# Stream responses in real-time for better interactivity.
+# Uses `OpenAIResponsesStream` flavor from StreamCallbacks.jl.
+
+using PromptingTools: StreamCallback
+
+# Basic streaming to stdout - see tokens appear as they're generated
+println("\n=== Streaming to stdout ===")
+response = aigenerate(schema, "Count from 1 to 10, one number per line.";
+    model = "gpt-5-mini",
+    streamcallback = stdout,
+    verbose = false)
+
+# Streaming with custom StreamCallback to capture chunks
+println("\n\n=== Streaming with StreamCallback ===")
+cb = StreamCallback() # captures all chunks for inspection
+response = aigenerate(schema, "What is Julia in one sentence?";
+    model = "gpt-5-mini",
+    streamcallback = cb,
+    verbose = false)
+
+println("Final content: ", response.content)
+println("Number of chunks received: ", length(cb.chunks))
+
+# Streaming to an IOBuffer for programmatic capture
+output = IOBuffer()
+cb = StreamCallback(; out = output)
+response = aigenerate(schema, "Say hello in 3 languages.";
+    model = "gpt-5-mini",
+    streamcallback = cb,
+    verbose = false)
+
+streamed_text = String(take!(output))
+println("Captured streamed text: ", streamed_text)
+
+# Streaming with reasoning models - see reasoning and output streamed
+println("\n=== Streaming with Reasoning ===")
+cb = StreamCallback(; out = stdout)
+response = aigenerate(schema, "What is 15 * 7? Think step by step.";
+    model = "o4-mini",
+    api_kwargs = (reasoning = Dict("effort" => "medium", "summary" => "auto"),),
+    streamcallback = cb,
+    verbose = false)
+
+println("\nReasoning content: ", response.extras[:reasoning_content])
+
 # ## Summary of Key Features
 #
 # | Feature | How to Use |
@@ -236,4 +283,5 @@ println("Template response: ", response.content)
 # | Multi-turn (efficient) | `previous_response_id = response.extras[:response_id]` |
 # | Structured extraction | `aiextract(schema, prompt; return_type=MyStruct)` |
 # | Web search | `enable_websearch = true` |
+# | Streaming | `streamcallback = stdout` or `StreamCallback()` |
 # | Access reasoning | `response.extras[:reasoning_content]` |

src/PromptingTools.jl

Lines changed: 3 additions & 4 deletions
@@ -13,10 +13,9 @@ import Preferences
 using Preferences: @load_preference, @set_preferences!
 using PrecompileTools
 using StreamCallbacks
-using StreamCallbacks: OpenAIStream, AnthropicStream, OllamaStream, StreamCallback,
-                       StreamChunk, AbstractStreamCallback
-# ResponseStream will be available in a future StreamCallbacks release
-# For now, we use OpenAIStream as a fallback for the Responses API
+using StreamCallbacks: OpenAIStream, OpenAIResponsesStream, AnthropicStream, OllamaStream,
+                       StreamCallback, StreamChunk, AbstractStreamCallback,
+                       streamed_request!, build_response_body
 using Test, Pkg
 ## Added REPL because it extends methods in Base.docs for extraction of docstrings
 using REPL

src/llm_interface.jl

Lines changed: 7 additions & 7 deletions
@@ -587,19 +587,19 @@ isextracted(x) = x isa AbstractExtractedData
 # which is used by models like gpt-5.1-codex that don't support the standard chat completions API.

 """
-    AbstractResponseSchema
+    AbstractOpenAIResponseSchema

-Abstract type for all response-based schemas that use the `/responses` endpoint instead of `/chat/completions`.
+Abstract type for all OpenAI response-based schemas that use the `/responses` endpoint instead of `/chat/completions`.
 """
-abstract type AbstractResponseSchema <: AbstractPromptSchema end
+abstract type AbstractOpenAIResponseSchema <: AbstractPromptSchema end

 """
-    OpenAIResponseSchema <: AbstractResponseSchema
+    OpenAIResponseSchema <: AbstractOpenAIResponseSchema

 A schema for OpenAI's Responses API (`/responses` endpoint).

 This schema is used for models that only support the Responses API, such as `gpt-5.1-codex`.
-Unlike the standard chat completions API, the Responses API uses `input` and `instructions`
+Unlike the standard chat completions API, the Responses API uses `input` and `instructions`
 fields instead of a messages array.

 # Example
@@ -608,10 +608,10 @@ schema = OpenAIResponseSchema()
 response = aigenerate(schema, "What is Julia?"; model="gpt-5.1-codex")
 ```
 """
-struct OpenAIResponseSchema <: AbstractResponseSchema end
+struct OpenAIResponseSchema <: AbstractOpenAIResponseSchema end

 "Echoes the user's input back to them. Used for testing the Responses API implementation"
-@kwdef mutable struct TestEchoOpenAIResponseSchema <: AbstractResponseSchema
+@kwdef mutable struct TestEchoOpenAIResponseSchema <: AbstractOpenAIResponseSchema
     response::AbstractDict = Dict(
         "id" => "resp_test123",
         "object" => "response",

src/llm_openai_responses.jl

Lines changed: 25 additions & 26 deletions
@@ -15,7 +15,7 @@ function create_response(schema::TestEchoOpenAIResponseSchema, api_key::Abstract
 end

 """
-    create_response(schema::AbstractResponseSchema, api_key::AbstractString,
+    create_response(schema::AbstractOpenAIResponseSchema, api_key::AbstractString,
         model::AbstractString,
         input;
         instructions::Union{Nothing, AbstractString} = nothing,
@@ -29,7 +29,7 @@ end
 Creates a response using the OpenAI Responses API with streaming support.

 # Arguments
-- `schema::AbstractResponseSchema`: The response schema to use
+- `schema::AbstractOpenAIResponseSchema`: The response schema to use
 - `api_key::AbstractString`: The API key to use for the OpenAI API
 - `model::AbstractString`: The model to use for generating the response
 - `input`: The input for the model, can be a string or structured input
@@ -46,7 +46,7 @@ Creates a response using the OpenAI Responses API with streaming support.
 # Returns
 - `response`: The response from the OpenAI API
 """
-function create_response(schema::AbstractResponseSchema, api_key::AbstractString,
+function create_response(schema::AbstractOpenAIResponseSchema, api_key::AbstractString,
         model::AbstractString,
         input;
         instructions::Union{Nothing, AbstractString} = nothing,
@@ -73,23 +73,19 @@ function create_response(schema::AbstractResponseSchema, api_key::AbstractString
         body["stream"] = true
     end

-    # Add all parameters from api_kwargs
+    # Add all parameters from api_kwargs (except url which is used for testing)
     # Supports: reasoning, text, temperature, max_output_tokens, etc.
     for (key, value) in pairs(api_kwargs)
+        key == :url && continue # url is used for testing, not sent to API
         body[string(key)] = value
     end

-    # Make the API request
-    url = OpenAI.build_url(OpenAI.DEFAULT_PROVIDER, "responses")
+    # Make the API request (url can be overridden via api_kwargs for testing)
+    url = get(api_kwargs, :url, OpenAI.build_url(OpenAI.DEFAULT_PROVIDER, "responses"))
     headers = OpenAI.auth_header(OpenAI.DEFAULT_PROVIDER, api_key)

     if !isnothing(streamcallback)
-        # Streaming is not yet supported for the Responses API
-        # The Responses API uses a different SSE format than Chat Completions,
-        # requiring a dedicated ResponseStream flavor in StreamCallbacks.jl
-        throw(ArgumentError("Streaming is not yet supported for OpenAI Responses API (OpenAIResponseSchema). Use non-streaming requests for now."))
-
-        # Configure streaming callback - only pass schema, no extra kwargs
+        # Configure streaming callback
         streamcallback, stream_kwargs = configure_callback!(streamcallback, schema)

         # Convert body dict to IOBuffer for streaming (streamed_request! expects IOBuffer)
@@ -99,7 +95,10 @@ function create_response(schema::AbstractResponseSchema, api_key::AbstractString

         # Use streaming request
         resp = streamed_request!(streamcallback, url, headers, input; http_kwargs...)
-        return OpenAI.OpenAIResponse(resp.status, JSON3.read(resp.body))
+
+        # Build response body from chunks using StreamCallbacks
+        response_body = build_response_body(streamcallback.flavor, streamcallback)
+        return OpenAI.OpenAIResponse(resp.status, response_body)
     else
         # Convert the body to JSON for non-streaming
         json_body = JSON3.write(body)
@@ -111,7 +110,7 @@ function create_response(schema::AbstractResponseSchema, api_key::AbstractString
 end

 """
-    render(schema::AbstractResponseSchema, messages::Vector{<:AbstractMessage};
+    render(schema::AbstractOpenAIResponseSchema, messages::Vector{<:AbstractMessage};
         conversation::AbstractVector{<:AbstractMessage} = AbstractMessage[],
         no_system_message::Bool = false,
         kwargs...)
@@ -124,7 +123,7 @@ The Responses API expects:
 - `instructions`: System-level instructions (from SystemMessage, optional)

 # Arguments
-- `schema::AbstractResponseSchema`: The response schema
+- `schema::AbstractOpenAIResponseSchema`: The response schema
 - `messages::Vector{<:AbstractMessage}`: Messages to render
 - `conversation`: Previous conversation history (currently limited support)
 - `no_system_message`: If true, don't add default system message
@@ -133,7 +132,7 @@ The Responses API expects:
 # Returns
 - `NamedTuple{(:input, :instructions), Tuple{String, Union{Nothing, String}}}`: Rendered input and instructions
 """
-function render(schema::AbstractResponseSchema,
+function render(schema::AbstractOpenAIResponseSchema,
         messages::Vector{<:AbstractMessage};
         conversation::AbstractVector{<:AbstractMessage} = AbstractMessage[],
         no_system_message::Bool = false,
@@ -165,23 +164,23 @@ function render(schema::AbstractResponseSchema,
 end

 # Render for string prompts - wrap in UserMessage and process
-function render(schema::AbstractResponseSchema, prompt::AbstractString;
+function render(schema::AbstractOpenAIResponseSchema, prompt::AbstractString;
         no_system_message::Bool = true, kwargs...)
     render(schema, [UserMessage(prompt)]; no_system_message, kwargs...)
 end

 # Render for single message
-function render(schema::AbstractResponseSchema, msg::AbstractMessage; kwargs...)
+function render(schema::AbstractOpenAIResponseSchema, msg::AbstractMessage; kwargs...)
     render(schema, [msg]; kwargs...)
 end

 # Render for AITemplate
-function render(schema::AbstractResponseSchema, template::AITemplate; kwargs...)
+function render(schema::AbstractOpenAIResponseSchema, template::AITemplate; kwargs...)
     render(schema, render(template); kwargs...)
 end

 # Render for Symbol (template name)
-function render(schema::AbstractResponseSchema, template::Symbol; kwargs...)
+function render(schema::AbstractOpenAIResponseSchema, template::Symbol; kwargs...)
     render(schema, AITemplate(template); kwargs...)
 end

@@ -225,7 +224,7 @@ function extract_response_content(response)
 end

 """
-    aigenerate(schema::AbstractResponseSchema, prompt::ALLOWED_PROMPT_TYPE;
+    aigenerate(schema::AbstractOpenAIResponseSchema, prompt::ALLOWED_PROMPT_TYPE;
         previous_response_id::Union{Nothing, AbstractString} = nothing,
         enable_websearch::Bool = false,
         model::AbstractString = MODEL_CHAT,
@@ -238,7 +237,7 @@ Generate an AI response using the OpenAI Responses API with streaming support.
 Returns an AIMessage with the response content and additional information in the extras field.

 # Arguments
-- `schema::AbstractResponseSchema`: The schema to use (e.g., `OpenAIResponseSchema()`)
+- `schema::AbstractOpenAIResponseSchema`: The schema to use (e.g., `OpenAIResponseSchema()`)
 - `prompt`: The prompt to send to the API, can be:
     - A string (sent as user input)
     - A vector of AbstractMessages (SystemMessage becomes instructions, UserMessage becomes input)
@@ -285,7 +284,7 @@ response = aigenerate(schema, "Solve 2+2*3";
 println(response.extras[:reasoning_content])
 ```
 """
-function aigenerate(schema::AbstractResponseSchema, prompt::ALLOWED_PROMPT_TYPE;
+function aigenerate(schema::AbstractOpenAIResponseSchema, prompt::ALLOWED_PROMPT_TYPE;
         previous_response_id::Union{Nothing, AbstractString} = nothing,
         enable_websearch::Bool = false,
         model::AbstractString = MODEL_CHAT,
@@ -367,7 +366,7 @@ function aigenerate(schema::AbstractResponseSchema, prompt::ALLOWED_PROMPT_TYPE;
 end

 """
-    aiextract(schema::AbstractResponseSchema, prompt::ALLOWED_PROMPT_TYPE;
+    aiextract(schema::AbstractOpenAIResponseSchema, prompt::ALLOWED_PROMPT_TYPE;
         return_type::Union{Type, AbstractTool},
         model::AbstractString = MODEL_CHAT,
         api_key::AbstractString = "",
@@ -383,7 +382,7 @@ Note: Unlike the Chat Completions API, the Responses API `text.format` only supp
 JSON schema. For multi-type extraction (union of structs), use the Chat Completions API instead.

 # Arguments
-- `schema::AbstractResponseSchema`: The schema to use
+- `schema::AbstractOpenAIResponseSchema`: The schema to use
 - `prompt`: The input prompt
 - `return_type`: A Julia struct type or AbstractTool to extract (single type only)
 - `model`: The model to use
@@ -424,7 +423,7 @@ result = aiextract(schema, "Solve: What is 15% of 80?";
 println(result.extras[:reasoning_content])
 ```
 """
-function aiextract(schema::AbstractResponseSchema, prompt::ALLOWED_PROMPT_TYPE;
+function aiextract(schema::AbstractOpenAIResponseSchema, prompt::ALLOWED_PROMPT_TYPE;
         return_type::Union{Type, AbstractTool},
         model::AbstractString = MODEL_CHAT,
        api_key::AbstractString = "",
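The `api_kwargs` handling added to `create_response` can be illustrated standalone. This sketch mirrors only the filtering logic from the diff; the kwarg values and the default URL below are illustrative, not taken from the package:

```julia
# Every api_kwarg except :url is forwarded into the request body;
# :url only overrides the endpoint (used by the test suite).
api_kwargs = (; temperature = 0.7, url = "http://localhost:8080/responses")

body = Dict{String, Any}("model" => "gpt-5-mini")
for (key, value) in pairs(api_kwargs)
    key == :url && continue  # test-only override, never sent to the API
    body[string(key)] = value
end

default_url = "https://api.openai.com/v1/responses"  # illustrative default
url = get(api_kwargs, :url, default_url)

@assert !haskey(body, "url")
@assert body["temperature"] == 0.7
@assert url == "http://localhost:8080/responses"
```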

src/streaming.jl

Lines changed: 3 additions & 4 deletions
@@ -22,18 +22,17 @@ function configure_callback!(cb::T, schema::AbstractPromptSchema;
         api_kwargs = (;
             api_kwargs..., stream = true, stream_options = (; include_usage = true))
         flavor = OpenAIStream()
-    elseif schema isa AbstractResponseSchema
+    elseif schema isa AbstractOpenAIResponseSchema
         ## Enable streaming for Response API
-        ## Note: Using OpenAIStream until ResponseStream is available in StreamCallbacks
         api_kwargs = (; api_kwargs..., stream = true)
-        flavor = OpenAIStream()
+        flavor = OpenAIResponsesStream()
     elseif schema isa Union{AbstractAnthropicSchema, AbstractOllamaSchema}
         api_kwargs = (; api_kwargs..., stream = true)
         flavor = schema isa AbstractOllamaSchema ? OllamaStream() : AnthropicStream()
     elseif schema isa AbstractOllamaManagedSchema
         throw(ErrorException("OllamaManagedSchema is not supported for streaming. Use OllamaSchema instead."))
     else
-        error("Unsupported schema type: $(typeof(schema)). Currently supported: OpenAISchema, AbstractResponseSchema, and AnthropicSchema.")
+        error("Unsupported schema type: $(typeof(schema)). Currently supported: OpenAISchema, AbstractOpenAIResponseSchema, and AnthropicSchema.")
     end
     cb.flavor = flavor
 end
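The flavor selection in `configure_callback!` boils down to dispatch on the schema type. A toy reduction of that idea (the types below are stand-ins defined here for illustration; the real flavors come from StreamCallbacks.jl and `pick_flavor` is hypothetical):

```julia
# Stand-ins for the StreamCallbacks flavors and PromptingTools schemas:
struct OpenAIStream end
struct OpenAIResponsesStream end

abstract type AbstractPromptSchema end
struct OpenAISchema <: AbstractPromptSchema end
abstract type AbstractOpenAIResponseSchema <: AbstractPromptSchema end
struct OpenAIResponseSchema <: AbstractOpenAIResponseSchema end

# After this commit, Responses-API schemas get their own SSE flavor
# instead of falling back to OpenAIStream:
pick_flavor(::OpenAISchema) = OpenAIStream()
pick_flavor(::AbstractOpenAIResponseSchema) = OpenAIResponsesStream()

@assert pick_flavor(OpenAISchema()) isa OpenAIStream
@assert pick_flavor(OpenAIResponseSchema()) isa OpenAIResponsesStream
```

Dispatching on the abstract supertype means any future `AbstractOpenAIResponseSchema` subtype (such as the `TestEchoOpenAIResponseSchema` in this commit) picks up the Responses streaming flavor automatically.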
File renamed without changes.
