- # web_search_and_util.py
-
- from bs4 import BeautifulSoup
- import requests
- from dotenv import load_dotenv
- import os
-
- load_dotenv('.env')
-
- api_key = os.getenv('API_KEY')
- cse_id = os.getenv('CSE_ID')
-
- TRUNCATE_SCRAPED_TEXT = 50000  # Adjust based on your model's context window
- SEARCH_DEPTH = 2  # Default depth for Google Custom Search queries
-
- # ------------------------------------------------------------------
- # Optional: patch asyncio to allow nested event loops (e.g., inside Jupyter)
- # ------------------------------------------------------------------
-
- try:
-     import nest_asyncio  # type: ignore
-
-     # ``nest_asyncio`` monkey-patches the running event-loop so that further
-     # calls to ``asyncio.run`` or ``loop.run_until_complete`` do **not** raise
-     # ``RuntimeError: This event loop is already running``. This makes the
-     # synchronous helper functions below safe to call in notebook cells while
-     # still working unchanged in regular Python scripts.
-
-     nest_asyncio.apply()
- except ImportError:  # pragma: no cover
-     # ``nest_asyncio`` is an optional dependency. If it is unavailable we
-     # simply skip patching – the helper functions will still work in regular
-     # Python scripts but may raise ``RuntimeError`` when called from within
-     # environments that already run an event-loop (e.g., Jupyter).
-     pass
-
- def search(search_item, api_key, cse_id, search_depth=SEARCH_DEPTH, site_filter=None):
-     service_url = 'https://www.googleapis.com/customsearch/v1'
-
-     params = {
-         'q': search_item,
-         'key': api_key,
-         'cx': cse_id,
-         'num': search_depth
-     }
-
-     if api_key is None or cse_id is None:
-         raise ValueError("API key and CSE ID are required")
-
-     try:
-         response = requests.get(service_url, params=params)
-         response.raise_for_status()
-         results = response.json()
-
-         # ------------------------------------------------------------------
-         # Robust handling – always return a *list* (never ``None``)
-         # ------------------------------------------------------------------
-         items = results.get("items", [])
-
-         # Optional site filtering
-         if site_filter:
-             items = [itm for itm in items if site_filter in itm.get("link", "")]
-             if not items:
-                 print(f"No results with {site_filter} found.")
-
-         # Graceful handling of empty results
-         if not items:
-             print("No search results found.")
-             return []
-
-         return items
-
-     except requests.exceptions.RequestException as e:
-         print(f"An error occurred during the search: {e}")
-         return []
-
-
-
-
- def retrieve_content(url, max_tokens=TRUNCATE_SCRAPED_TEXT):
-     try:
-         headers = {'User-Agent': 'Mozilla/5.0'}
-         response = requests.get(url, headers=headers, timeout=10)
-         response.raise_for_status()
-
-         soup = BeautifulSoup(response.content, 'html.parser')
-         for script_or_style in soup(['script', 'style']):
-             script_or_style.decompose()
-
-         text = soup.get_text(separator=' ', strip=True)
-         characters = max_tokens * 4  # Approximate conversion
-         text = text[:characters]
-         return text
-     except requests.exceptions.RequestException as e:
-         print(f"Failed to retrieve {url}: {e}")
-         return None
-
-
-
- async def get_search_results(search_items, search_term: str, character_limit: int = 500):
-     # Generate a summary of search results for the given search term
-     results_list = []
-     for idx, item in enumerate(search_items, start=1):
-         url = item.get('link')
-
-         snippet = item.get('snippet', '')
-         web_content = retrieve_content(url, TRUNCATE_SCRAPED_TEXT)
-
-         if web_content is None:
-             print(f"Error: skipped URL: {url}")
-         else:
-             summary = summarize_content(web_content, search_term, character_limit)
-             result_dict = {
-                 'order': idx,
-                 'link': url,
-                 'title': snippet,
-                 'Summary': summary
-             }
-             results_list.append(result_dict)
-     return results_list
-
- # ------------------------------------------------------------------
- # Helper using WebPageSummaryAgent for content summarisation
- # ------------------------------------------------------------------
- # NOTE:
- # ``WebPageSummaryAgent`` is an agent wrapper that internally spins up an
- # ``agents.Agent`` instance with the correct system prompt for Web page
- # summarisation. Because the ``task`` method on the wrapper is *async*, we
- # provide a small synchronous wrapper that takes care of running the coroutine
- # irrespective of whether the caller is inside an active event-loop (e.g.
- # Jupyter notebooks) or not.
-
- from ai_research_assistant_resources.agents_tools_registry.web_page_summary_agent import WebPageSummaryAgent
- import asyncio
-
-
- def summarize_content(content: str, search_term: str, character_limit: int = 2000) -> str:  # noqa: D401
-
-     # Instantiate the agent with the dynamic instructions.
-     agent = WebPageSummaryAgent(search_term=search_term, character_limit=character_limit)
+ from openai import OpenAI
+
+ client = OpenAI()
+
+
+ def openai_web_search(
+     query: str,
+     model: str = "gpt-4.1",
+     search_context_size: str = "high",
+ ) -> dict:
+     resp = client.responses.create(
+         model=model,
+         tools=[
+             {"type": "web_search_preview", "search_context_size": search_context_size}
+         ],
+         input=f"Search the web for the following information and provide citations: {query}",
+     )

-     # Run the agent task, making sure we properly handle the presence (or
-     # absence) of an already-running event-loop.
-     try:
-         return asyncio.run(agent.task(content))
-     except RuntimeError:
-         # We are *probably* inside an existing event-loop (common in notebooks
-         # or async frameworks). In that case fall back to using the current
-         # loop instead of creating a new one.
-         loop = asyncio.get_event_loop()
-         return loop.run_until_complete(agent.task(content))
+     answer = ""
+     citations = []

+     for item in resp.output:
+         if item.type == "message":
+             for part in item.content:
+                 if part.type == "output_text":
+                     answer = part.text
+                     for ann in part.annotations or []:
+                         if ann.type == "url_citation":
+                             citations.append(
+                                 {
+                                     "url": ann.url,
+                                     "title": getattr(ann, "title", None),
+                                     "start_index": getattr(ann, "start_index", None),
+                                     "end_index": getattr(ann, "end_index", None),
+                                 }
+                             )

- # ------------------------------------------------------------------
- # High-level convenience API
- # ------------------------------------------------------------------
+     return {"answer": answer, "citations": citations}


def get_results_for_search_term(
-     search_term: str,
-     *,
-     character_limit: int = 2000,
-     search_depth: int = SEARCH_DEPTH,
-     site_filter: str | None = None,
- ) -> list[dict]:
-     """Search the Web for *search_term* and return enriched result dictionaries.
-
-     The function handles the entire workflow:
-
-     1. Perform a Google Custom Search using the provided credentials.
-     2. Retrieve and clean the contents of each result page.
-     3. Generate a concise summary of each page focused on *search_term* using
-        :pyfunc:`summarize_content`.
-
-     The returned value is a ``list`` of ``dict`` objects with the following
-     keys: ``order``, ``link``, ``title`` and ``Summary``.
-     """
-
-     # Step 1 – search.
-     search_items = search(
-         search_term,
-         api_key=api_key,
-         cse_id=cse_id,
-         search_depth=search_depth,
-         site_filter=site_filter,
-     )
-
-     # Step 2 & 3 – scrape pages and summarise.
-     # ``get_search_results`` is an *async* coroutine. Execute it and
-     # return its result, transparently handling the presence (or absence)
-     # of an already-running event loop (e.g. in notebooks).
-
-     try:
-         # Prefer ``asyncio.run`` which creates and manages a fresh event
-         # loop. This is the most robust option for regular Python
-         # scripts.
-         import asyncio  # local import to avoid polluting module top-level
-
-         return asyncio.run(
-             get_search_results(search_items, search_term, character_limit)
-         )
-     except RuntimeError:
-         # We probably find ourselves inside an existing event loop (for
-         # instance when this helper is invoked from within a Jupyter
-         # notebook). Fall back to re-using the current loop.
-         loop = asyncio.get_event_loop()
-         return loop.run_until_complete(
-             get_search_results(search_items, search_term, character_limit)
-         )
+     search_term: str, *, search_context_size: str = "high"
+ ) -> dict:
+     return openai_web_search(search_term, search_context_size=search_context_size)
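
A minimal usage sketch for the helpers added in this diff, assuming the module keeps its `web_search_and_util.py` name and that `OPENAI_API_KEY` is available in the environment for the module-level `OpenAI()` client; the query string is purely illustrative:

```python
# Illustrative only: exercises the new OpenAI web-search helpers from this diff.
# Assumes OPENAI_API_KEY is set so the module-level OpenAI() client can authenticate.
from web_search_and_util import get_results_for_search_term

result = get_results_for_search_term(
    "current state of the art in retrieval-augmented generation",
    search_context_size="high",
)

print(result["answer"])                       # model-generated answer text
for citation in result["citations"]:          # url_citation annotations, if any
    print(citation["url"], citation["title"])
```

Note that, unlike the removed Google Custom Search pipeline, the helper now returns a single dict with `answer` and `citations` keys rather than a list of per-page summaries, so existing callers of `get_results_for_search_term` will need to adapt.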