
Commit 5e9a1b1

use openai web search, update models used (#1894)
1 parent 02159ed commit 5e9a1b1

File tree

9 files changed: +355, -352 lines


authors.yaml

Lines changed: 5 additions & 0 deletions
@@ -3,6 +3,11 @@
 # You can optionally customize how your information shows up cookbook.openai.com over here.
 # If your information is not present here, it will be pulled from your GitHub profile.

+moustafa-openai:
+  name: "Moustafa Elhadary"
+  website: "https://www.linkedin.com/in/moustafaelhadary/"
+  avatar: "https://avatars.githubusercontent.com/u/198829901?v=4"
+
 theophile-openai:
   name: "Theophile Sautory"
   website: "https://www.linkedin.com/in/theophilesautory"

examples/agents_sdk/AI_Research_Assistant_Cookbook.ipynb

Lines changed: 69 additions & 74 deletions
Large diffs are not rendered by default.

examples/agents_sdk/REPORT_DRAFT.md

Lines changed: 35 additions & 31 deletions
Large diffs are not rendered by default.

examples/agents_sdk/ai_research_assistant_resources/agents_tools_registry/query_expansion_agent.py

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ class QueryExpansionAgent:

     """

-    def __init__(self, *, model: str = "o3-mini", tools: list | None = None, name: str | None = None,
+    def __init__(self, *, model: str = "o4-mini", tools: list | None = None, name: str | None = None,
                  instructions: str | None = None, input_guardrails: list | None = None):

         # Initialise the underlying `agents.Agent` with a structured `output_type` so it
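
This bumps QueryExpansionAgent's default reasoning model from o3-mini to o4-mini. A minimal usage sketch, based only on the constructor signature shown above (the import path is assumed from this file's location):

    # Assumed import path, taken from the file's position in the examples tree.
    from ai_research_assistant_resources.agents_tools_registry.query_expansion_agent import QueryExpansionAgent

    # No arguments needed: the agent now defaults to model="o4-mini".
    expander = QueryExpansionAgent()

    # Or override the default explicitly when a different reasoning model is wanted.
    expander_custom = QueryExpansionAgent(model="o4-mini", tools=[], input_guardrails=[])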

examples/agents_sdk/ai_research_assistant_resources/agents_tools_registry/web_page_summary_agent.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ def __init__(
         search_term: str,
         character_limit: int = 1000,
         *,
-        model: str = "gpt-4o",
+        model: str = "gpt-4.1",
         tools: list | None = None,
         name: str | None = None,
         instructions: str | None = None,
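
WebPageSummaryAgent gets the same treatment, moving its default model from gpt-4o to gpt-4.1 (as does the search-terms agent in the next file). The deleted summarize_content helper further down in this commit drove it with an asyncio.run call; a condensed sketch of that pattern, with the page text as a placeholder:

    import asyncio

    # Assumed import path, mirroring the deleted helper's import.
    from ai_research_assistant_resources.agents_tools_registry.web_page_summary_agent import WebPageSummaryAgent

    agent = WebPageSummaryAgent(search_term="vector databases", character_limit=1000)

    page_text = "...scraped page content..."  # placeholder input
    # `task` is a coroutine, so run it on an event loop when calling synchronously.
    summary = asyncio.run(agent.task(page_text))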

examples/agents_sdk/ai_research_assistant_resources/agents_tools_registry/web_search_terms_generation_agent.py

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ def __init__(
         self,
         num_search_terms: int = _NUM_SEARCH_TERMS,
         *,
-        model: str = "gpt-4o",
+        model: str = "gpt-4.1",
         tools: list | None = None,
         name: str | None = None,
         instructions: str | None = None,

examples/agents_sdk/ai_research_assistant_resources/guardrails/topic_content_guardrail.py

Lines changed: 10 additions & 5 deletions
@@ -16,10 +16,13 @@
 # 1. Tiny classifier agent → “Is this prompt about AI?”
 # ---------------------------------------------------------------------------

+
 class TopicCheckOutput(BaseModel):
     """Structured result returned by the classifier."""
-    is_about_ai: bool  # True → prompt is AI-related
-    reasoning: str  # short rationale (useful for logs)
+
+    is_about_ai: bool  # True → prompt is AI-related
+    reasoning: str  # short rationale (useful for logs)
+

 topic_guardrail_agent = Agent(
     name="Topic guardrail (AI)",
@@ -30,16 +33,17 @@ class TopicCheckOutput(BaseModel):
         "policy, or market trends. "
         "Return is_about_ai = false for all other domains (finance, biology, history, etc.)."
     ),
-    model="gpt-4o-mini",  # lightweight, fast
+    model="gpt-4.1-mini",  # lightweight, fast
     output_type=TopicCheckOutput,
 )

 # ---------------------------------------------------------------------------
 # 2. Guardrail function (decorated) that wraps the classifier
 # ---------------------------------------------------------------------------

+
 @input_guardrail
-async def ai_topic_guardrail(
+async def ai_topic_guardrail(
     ctx: RunContextWrapper[None],
     agent: Agent,
     input: str | List[TResponseInputItem],
@@ -53,5 +57,6 @@ async def ai_topic_guardrail(

     return output

+
 # Optional: tidy public surface
-__all__ = ["ai_topic_guardrail", "TopicCheckOutput"]
+__all__ = ["ai_topic_guardrail", "TopicCheckOutput"]
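
Beyond the classifier's model bump to gpt-4.1-mini, the changes here are formatting (blank lines around the class body and the decorated function). The guardrail itself is attached by passing it to an agent's input_guardrails. A hedged wiring sketch using the Agents SDK surface; the guardrail import path is an assumption:

    import asyncio

    from agents import Agent, InputGuardrailTripwireTriggered, Runner
    # Assumed import path, based on this repository's layout.
    from ai_research_assistant_resources.guardrails.topic_content_guardrail import ai_topic_guardrail

    research_agent = Agent(
        name="Research agent",
        instructions="Answer questions about AI research.",
        input_guardrails=[ai_topic_guardrail],
    )

    async def main() -> None:
        try:
            await Runner.run(research_agent, "Summarise the causes of the French Revolution.")
        except InputGuardrailTripwireTriggered:
            # The classifier returned is_about_ai=False, so the tripwire fired
            # before the research agent produced any output.
            print("Blocked: prompt is not about AI.")

    asyncio.run(main())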
web_search_and_util.py

Lines changed: 38 additions & 203 deletions
@@ -1,209 +1,44 @@
-# web_search_and_util.py
-
-from bs4 import BeautifulSoup
-import requests
-from dotenv import load_dotenv
-import os
-
-load_dotenv('.env')
-
-api_key = os.getenv('API_KEY')
-cse_id = os.getenv('CSE_ID')
-
-TRUNCATE_SCRAPED_TEXT = 50000  # Adjust based on your model's context window
-SEARCH_DEPTH = 2  # Default depth for Google Custom Search queries
-
-# ------------------------------------------------------------------
-# Optional: patch asyncio to allow nested event loops (e.g., inside Jupyter)
-# ------------------------------------------------------------------
-
-try:
-    import nest_asyncio  # type: ignore
-
-    # ``nest_asyncio`` monkey-patches the running event-loop so that further
-    # calls to ``asyncio.run`` or ``loop.run_until_complete`` do **not** raise
-    # ``RuntimeError: This event loop is already running``. This makes the
-    # synchronous helper functions below safe to call in notebook cells while
-    # still working unchanged in regular Python scripts.
-
-    nest_asyncio.apply()
-except ImportError:  # pragma: no cover
-    # ``nest_asyncio`` is an optional dependency. If it is unavailable we
-    # simply skip patching – the helper functions will still work in regular
-    # Python scripts but may raise ``RuntimeError`` when called from within
-    # environments that already run an event-loop (e.g., Jupyter).
-    pass
-
-def search(search_item, api_key, cse_id, search_depth=SEARCH_DEPTH, site_filter=None):
-    service_url = 'https://www.googleapis.com/customsearch/v1'
-
-    params = {
-        'q': search_item,
-        'key': api_key,
-        'cx': cse_id,
-        'num': search_depth
-    }
-
-    if api_key is None or cse_id is None:
-        raise ValueError("API key and CSE ID are required")
-
-    try:
-        response = requests.get(service_url, params=params)
-        response.raise_for_status()
-        results = response.json()
-
-        # ------------------------------------------------------------------
-        # Robust handling – always return a *list* (never ``None``)
-        # ------------------------------------------------------------------
-        items = results.get("items", [])
-
-        # Optional site filtering
-        if site_filter:
-            items = [itm for itm in items if site_filter in itm.get("link", "")]
-            if not items:
-                print(f"No results with {site_filter} found.")
-
-        # Graceful handling of empty results
-        if not items:
-            print("No search results found.")
-            return []
-
-        return items
-
-    except requests.exceptions.RequestException as e:
-        print(f"An error occurred during the search: {e}")
-        return []
-
-
-
-
-def retrieve_content(url, max_tokens=TRUNCATE_SCRAPED_TEXT):
-    try:
-        headers = {'User-Agent': 'Mozilla/5.0'}
-        response = requests.get(url, headers=headers, timeout=10)
-        response.raise_for_status()
-
-        soup = BeautifulSoup(response.content, 'html.parser')
-        for script_or_style in soup(['script', 'style']):
-            script_or_style.decompose()
-
-        text = soup.get_text(separator=' ', strip=True)
-        characters = max_tokens * 4  # Approximate conversion
-        text = text[:characters]
-        return text
-    except requests.exceptions.RequestException as e:
-        print(f"Failed to retrieve {url}: {e}")
-        return None
-
-
-
-async def get_search_results(search_items, search_term: str, character_limit: int = 500):
-    # Generate a summary of search results for the given search term
-    results_list = []
-    for idx, item in enumerate(search_items, start=1):
-        url = item.get('link')
-
-        snippet = item.get('snippet', '')
-        web_content = retrieve_content(url, TRUNCATE_SCRAPED_TEXT)
-
-        if web_content is None:
-            print(f"Error: skipped URL: {url}")
-        else:
-            summary = summarize_content(web_content, search_term, character_limit)
-            result_dict = {
-                'order': idx,
-                'link': url,
-                'title': snippet,
-                'Summary': summary
-            }
-            results_list.append(result_dict)
-    return results_list
-
-# ------------------------------------------------------------------
-# Helper using WebPageSummaryAgent for content summarisation
-# ------------------------------------------------------------------
-# NOTE:
-# ``WebPageSummaryAgent`` is an agent wrapper that internally spins up an
-# ``agents.Agent`` instance with the correct system prompt for Web page
-# summarisation. Because the ``task`` method on the wrapper is *async*, we
-# provide a small synchronous wrapper that takes care of running the coroutine
-# irrespective of whether the caller is inside an active event-loop (e.g.
-# Jupyter notebooks) or not.
-
-from ai_research_assistant_resources.agents_tools_registry.web_page_summary_agent import WebPageSummaryAgent
-import asyncio
-
-
-def summarize_content(content: str, search_term: str, character_limit: int = 2000) -> str:  # noqa: D401
-
-    # Instantiate the agent with the dynamic instructions.
-    agent = WebPageSummaryAgent(search_term=search_term, character_limit=character_limit)
+from openai import OpenAI
+
+client = OpenAI()
+
+
+def openai_web_search(
+    query: str,
+    model: str = "gpt-4.1",
+    search_context_size: str = "high",
+) -> dict:
+    resp = client.responses.create(
+        model=model,
+        tools=[
+            {"type": "web_search_preview", "search_context_size": search_context_size}
+        ],
+        input=f"Search the web for the following information and provide citations: {query}",
+    )

-    # Run the agent task, making sure we properly handle the presence (or
-    # absence) of an already-running event-loop.
-    try:
-        return asyncio.run(agent.task(content))
-    except RuntimeError:
-        # We are *probably* inside an existing event-loop (common in notebooks
-        # or async frameworks). In that case fall back to using the current
-        # loop instead of creating a new one.
-        loop = asyncio.get_event_loop()
-        return loop.run_until_complete(agent.task(content))
+    answer = ""
+    citations = []

+    for item in resp.output:
+        if item.type == "message":
+            for part in item.content:
+                if part.type == "output_text":
+                    answer = part.text
+                    for ann in part.annotations or []:
+                        if ann.type == "url_citation":
+                            citations.append(
+                                {
+                                    "url": ann.url,
+                                    "title": getattr(ann, "title", None),
+                                    "start_index": getattr(ann, "start_index", None),
+                                    "end_index": getattr(ann, "end_index", None),
+                                }
+                            )

-# ------------------------------------------------------------------
-# High-level convenience API
-# ------------------------------------------------------------------
+    return {"answer": answer, "citations": citations}


 def get_results_for_search_term(
-    search_term: str,
-    *,
-    character_limit: int = 2000,
-    search_depth: int = SEARCH_DEPTH,
-    site_filter: str | None = None,
-) -> list[dict]:
-    """Search the Web for *search_term* and return enriched result dictionaries.
-
-    The function handles the entire workflow:
-
-    1. Perform a Google Custom Search using the provided credentials.
-    2. Retrieve and clean the contents of each result page.
-    3. Generate a concise summary of each page focused on *search_term* using
-       :pyfunc:`summarize_content`.
-
-    The returned value is a ``list`` of ``dict`` objects with the following
-    keys: ``order``, ``link``, ``title`` and ``Summary``.
-    """
-
-    # Step 1 – search.
-    search_items = search(
-        search_term,
-        api_key=api_key,
-        cse_id=cse_id,
-        search_depth=search_depth,
-        site_filter=site_filter,
-    )
-
-    # Step 2 & 3 – scrape pages and summarise.
-    # ``get_search_results`` is an *async* coroutine. Execute it and
-    # return its result, transparently handling the presence (or absence)
-    # of an already-running event loop (e.g. in notebooks).
-
-    try:
-        # Prefer ``asyncio.run`` which creates and manages a fresh event
-        # loop. This is the most robust option for regular Python
-        # scripts.
-        import asyncio  # local import to avoid polluting module top-level
-
-        return asyncio.run(
-            get_search_results(search_items, search_term, character_limit)
-        )
-    except RuntimeError:
-        # We probably find ourselves inside an existing event loop (for
-        # instance when this helper is invoked from within a Jupyter
-        # notebook). Fall back to re-using the current loop.
-        loop = asyncio.get_event_loop()
-        return loop.run_until_complete(
-            get_search_results(search_items, search_term, character_limit)
-        )
+    search_term: str, *, search_context_size: str = "high"
+) -> dict:
+    return openai_web_search(search_term, search_context_size=search_context_size)
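
The rewrite collapses the Google Custom Search pipeline (credentials, scraping, summarisation agents, nest_asyncio workarounds) into one synchronous Responses API call with the hosted web_search_preview tool. A short usage sketch of the new surface; the import path is an assumption, and the result keys ("answer", "citations") follow the parsing logic above:

    # Assumed import path; the module name comes from the deleted header comment.
    from web_search_and_util import get_results_for_search_term

    result = get_results_for_search_term("latest developments in AI agents")
    print(result["answer"][:300])
    for c in result["citations"]:
        print(c["url"], "-", c["title"])

Because the hosted tool performs retrieval and attaches url_citation annotations server-side, the module no longer needs the API_KEY/CSE_ID environment variables or the event-loop juggling the old helpers carried.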
