use openai web search, update models used #1894

Merged
5 changes: 5 additions & 0 deletions authors.yaml
@@ -3,6 +3,11 @@
 # You can optionally customize how your information shows up cookbook.openai.com over here.
 # If your information is not present here, it will be pulled from your GitHub profile.

+moustafa-openai:
+  name: "Moustafa Elhadary"
+  website: "https://www.linkedin.com/in/moustafaelhadary/"
+  avatar: "https://avatars.githubusercontent.com/u/198829901?v=4"
+
 theophile-openai:
   name: "Theophile Sautory"
   website: "https://www.linkedin.com/in/theophilesautory"
143 changes: 69 additions & 74 deletions examples/agents_sdk/AI_Research_Assistant_Cookbook.ipynb

Large diffs are not rendered by default.

66 changes: 35 additions & 31 deletions examples/agents_sdk/REPORT_DRAFT.md

Large diffs are not rendered by default.

@@ -53,7 +53,7 @@ class QueryExpansionAgent:

     """

-    def __init__(self, *, model: str = "o3-mini", tools: list | None = None, name: str | None = None,
+    def __init__(self, *, model: str = "o4-mini", tools: list | None = None, name: str | None = None,
                  instructions: str | None = None, input_guardrails: list | None = None):

         # Initialise the underlying `agents.Agent` with a structured `output_type` so it
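For review context, a minimal sketch of the structured-output pattern the comment above refers to (not code from this repo; the ExpandedQueries schema and constructor arguments are illustrative): the wrapper builds an agents.Agent whose output_type is a Pydantic model, so the expansion step returns validated structure rather than free text.

# Hedged sketch of the structured-output pattern (hypothetical schema).
from agents import Agent
from pydantic import BaseModel


class ExpandedQueries(BaseModel):
    queries: list[str]  # candidate web-search queries for the topic


query_expander = Agent(
    name="Query expansion",
    instructions="Expand the research topic into distinct web-search queries.",
    model="o4-mini",  # the new default set by this PR
    output_type=ExpandedQueries,  # replies are parsed into this schema
)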
@@ -17,7 +17,7 @@ def __init__(
         search_term: str,
         character_limit: int = 1000,
         *,
-        model: str = "gpt-4o",
+        model: str = "gpt-4.1",
         tools: list | None = None,
         name: str | None = None,
         instructions: str | None = None,
@@ -36,7 +36,7 @@ def __init__(
         self,
         num_search_terms: int = _NUM_SEARCH_TERMS,
         *,
-        model: str = "gpt-4o",
+        model: str = "gpt-4.1",
         tools: list | None = None,
         name: str | None = None,
         instructions: str | None = None,
@@ -16,10 +16,13 @@
 # 1. Tiny classifier agent → “Is this prompt about AI?”
 # ---------------------------------------------------------------------------

+
 class TopicCheckOutput(BaseModel):
     """Structured result returned by the classifier."""
-    is_about_ai: bool  # True → prompt is AI-related
-    reasoning: str  # short rationale (useful for logs)
+
+    is_about_ai: bool  # True → prompt is AI-related
+    reasoning: str  # short rationale (useful for logs)
+

 topic_guardrail_agent = Agent(
     name="Topic guardrail (AI)",
@@ -30,16 +33,17 @@ class TopicCheckOutput(BaseModel):
         "policy, or market trends. "
         "Return is_about_ai = false for all other domains (finance, biology, history, etc.)."
     ),
-    model="gpt-4o-mini",  # lightweight, fast
+    model="gpt-4.1-mini",  # lightweight, fast
     output_type=TopicCheckOutput,
 )

 # ---------------------------------------------------------------------------
 # 2. Guardrail function (decorated) that wraps the classifier
 # ---------------------------------------------------------------------------

+
 @input_guardrail
-async def ai_topic_guardrail(
+async def ai_topic_guardrail(
     ctx: RunContextWrapper[None],
     agent: Agent,
     input: str | List[TResponseInputItem],
@@ -53,5 +57,6 @@ async def ai_topic_guardrail(

     return output

+
 # Optional: tidy public surface
-__all__ = ["ai_topic_guardrail", "TopicCheckOutput"]
\ No newline at end of file
+__all__ = ["ai_topic_guardrail", "TopicCheckOutput"]
@@ -1,209 +1,44 @@
 # web_search_and_util.py

-from bs4 import BeautifulSoup
-import requests
-from dotenv import load_dotenv
-import os
-
-load_dotenv('.env')
-
-api_key = os.getenv('API_KEY')
-cse_id = os.getenv('CSE_ID')
-
-TRUNCATE_SCRAPED_TEXT = 50000  # Adjust based on your model's context window
-SEARCH_DEPTH = 2  # Default depth for Google Custom Search queries
-
-# ------------------------------------------------------------------
-# Optional: patch asyncio to allow nested event loops (e.g., inside Jupyter)
-# ------------------------------------------------------------------
-
-try:
-    import nest_asyncio  # type: ignore
-
-    # ``nest_asyncio`` monkey-patches the running event-loop so that further
-    # calls to ``asyncio.run`` or ``loop.run_until_complete`` do **not** raise
-    # ``RuntimeError: This event loop is already running``. This makes the
-    # synchronous helper functions below safe to call in notebook cells while
-    # still working unchanged in regular Python scripts.
-
-    nest_asyncio.apply()
-except ImportError:  # pragma: no cover
-    # ``nest_asyncio`` is an optional dependency. If it is unavailable we
-    # simply skip patching – the helper functions will still work in regular
-    # Python scripts but may raise ``RuntimeError`` when called from within
-    # environments that already run an event-loop (e.g., Jupyter).
-    pass
-
-def search(search_item, api_key, cse_id, search_depth=SEARCH_DEPTH, site_filter=None):
-    service_url = 'https://www.googleapis.com/customsearch/v1'
-
-    params = {
-        'q': search_item,
-        'key': api_key,
-        'cx': cse_id,
-        'num': search_depth
-    }
-
-    if api_key is None or cse_id is None:
-        raise ValueError("API key and CSE ID are required")
-
-    try:
-        response = requests.get(service_url, params=params)
-        response.raise_for_status()
-        results = response.json()
-
-        # ------------------------------------------------------------------
-        # Robust handling – always return a *list* (never ``None``)
-        # ------------------------------------------------------------------
-        items = results.get("items", [])
-
-        # Optional site filtering
-        if site_filter:
-            items = [itm for itm in items if site_filter in itm.get("link", "")]
-            if not items:
-                print(f"No results with {site_filter} found.")
-
-        # Graceful handling of empty results
-        if not items:
-            print("No search results found.")
-            return []
-
-        return items
-
-    except requests.exceptions.RequestException as e:
-        print(f"An error occurred during the search: {e}")
-        return []
-
-
-def retrieve_content(url, max_tokens=TRUNCATE_SCRAPED_TEXT):
-    try:
-        headers = {'User-Agent': 'Mozilla/5.0'}
-        response = requests.get(url, headers=headers, timeout=10)
-        response.raise_for_status()
-
-        soup = BeautifulSoup(response.content, 'html.parser')
-        for script_or_style in soup(['script', 'style']):
-            script_or_style.decompose()
-
-        text = soup.get_text(separator=' ', strip=True)
-        characters = max_tokens * 4  # Approximate conversion
-        text = text[:characters]
-        return text
-    except requests.exceptions.RequestException as e:
-        print(f"Failed to retrieve {url}: {e}")
-        return None
-
-
-async def get_search_results(search_items, search_term: str, character_limit: int = 500):
-    # Generate a summary of search results for the given search term
-    results_list = []
-    for idx, item in enumerate(search_items, start=1):
-        url = item.get('link')
-
-        snippet = item.get('snippet', '')
-        web_content = retrieve_content(url, TRUNCATE_SCRAPED_TEXT)
-
-        if web_content is None:
-            print(f"Error: skipped URL: {url}")
-        else:
-            summary = summarize_content(web_content, search_term, character_limit)
-            result_dict = {
-                'order': idx,
-                'link': url,
-                'title': snippet,
-                'Summary': summary
-            }
-            results_list.append(result_dict)
-    return results_list
-
-# ------------------------------------------------------------------
-# Helper using WebPageSummaryAgent for content summarisation
-# ------------------------------------------------------------------
-# NOTE:
-# ``WebPageSummaryAgent`` is an agent wrapper that internally spins up an
-# ``agents.Agent`` instance with the correct system prompt for Web page
-# summarisation. Because the ``task`` method on the wrapper is *async*, we
-# provide a small synchronous wrapper that takes care of running the coroutine
-# irrespective of whether the caller is inside an active event-loop (e.g.
-# Jupyter notebooks) or not.
-
-from ai_research_assistant_resources.agents_tools_registry.web_page_summary_agent import WebPageSummaryAgent
-import asyncio
-
-
-def summarize_content(content: str, search_term: str, character_limit: int = 2000) -> str:  # noqa: D401
-
-    # Instantiate the agent with the dynamic instructions.
-    agent = WebPageSummaryAgent(search_term=search_term, character_limit=character_limit)
-
-    # Run the agent task, making sure we properly handle the presence (or
-    # absence) of an already-running event-loop.
-    try:
-        return asyncio.run(agent.task(content))
-    except RuntimeError:
-        # We are *probably* inside an existing event-loop (common in notebooks
-        # or async frameworks). In that case fall back to using the current
-        # loop instead of creating a new one.
-        loop = asyncio.get_event_loop()
-        return loop.run_until_complete(agent.task(content))
-
-# ------------------------------------------------------------------
-# High-level convenience API
-# ------------------------------------------------------------------
+from openai import OpenAI
+
+client = OpenAI()
+
+
+def openai_web_search(
+    query: str,
+    model: str = "gpt-4.1",
+    search_context_size: str = "high",
+) -> dict:
+    resp = client.responses.create(
+        model=model,
+        tools=[
+            {"type": "web_search_preview", "search_context_size": search_context_size}
+        ],
+        input=f"Search the web for the following information and provide citations: {query}",
+    )
+
+    answer = ""
+    citations = []
+
+    for item in resp.output:
+        if item.type == "message":
+            for part in item.content:
+                if part.type == "output_text":
+                    answer = part.text
+                    for ann in part.annotations or []:
+                        if ann.type == "url_citation":
+                            citations.append(
+                                {
+                                    "url": ann.url,
+                                    "title": getattr(ann, "title", None),
+                                    "start_index": getattr(ann, "start_index", None),
+                                    "end_index": getattr(ann, "end_index", None),
+                                }
+                            )
+
+    return {"answer": answer, "citations": citations}


 def get_results_for_search_term(
-    search_term: str,
-    *,
-    character_limit: int = 2000,
-    search_depth: int = SEARCH_DEPTH,
-    site_filter: str | None = None,
-) -> list[dict]:
-    """Search the Web for *search_term* and return enriched result dictionaries.
-
-    The function handles the entire workflow:
-
-    1. Perform a Google Custom Search using the provided credentials.
-    2. Retrieve and clean the contents of each result page.
-    3. Generate a concise summary of each page focused on *search_term* using
-       :pyfunc:`summarize_content`.
-
-    The returned value is a ``list`` of ``dict`` objects with the following
-    keys: ``order``, ``link``, ``title`` and ``Summary``.
-    """
-
-    # Step 1 – search.
-    search_items = search(
-        search_term,
-        api_key=api_key,
-        cse_id=cse_id,
-        search_depth=search_depth,
-        site_filter=site_filter,
-    )
-
-    # Step 2 & 3 – scrape pages and summarise.
-    # ``get_search_results`` is an *async* coroutine. Execute it and
-    # return its result, transparently handling the presence (or absence)
-    # of an already-running event loop (e.g. in notebooks).
-
-    try:
-        # Prefer ``asyncio.run`` which creates and manages a fresh event
-        # loop. This is the most robust option for regular Python
-        # scripts.
-        import asyncio  # local import to avoid polluting module top-level
-
-        return asyncio.run(
-            get_search_results(search_items, search_term, character_limit)
-        )
-    except RuntimeError:
-        # We probably find ourselves inside an existing event loop (for
-        # instance when this helper is invoked from within a Jupyter
-        # notebook). Fall back to re-using the current loop.
-        loop = asyncio.get_event_loop()
-        return loop.run_until_complete(
-            get_search_results(search_items, search_term, character_limit)
-        )
+    search_term: str, *, search_context_size: str = "high"
+) -> dict:
+    return openai_web_search(search_term, search_context_size=search_context_size)
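For reference, a minimal usage sketch of the new helpers (not part of the diff; assumes OPENAI_API_KEY is set in the environment, and the example query is illustrative). The result shape is the dict built in openai_web_search above.

# Hedged usage sketch for the new search helper (same module).
if __name__ == "__main__":
    result = get_results_for_search_term("latest developments in AI agents")
    print(result["answer"][:500])  # leading slice of the answer text
    for c in result["citations"]:
        print(f"- {c['title']}: {c['url']}")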