- # web_search_and_util.py
-
- from bs4 import BeautifulSoup
- import requests
- from dotenv import load_dotenv
- import os
-
- load_dotenv('.env')
-
- api_key = os.getenv('API_KEY')
- cse_id = os.getenv('CSE_ID')
-
- TRUNCATE_SCRAPED_TEXT = 50000  # Adjust based on your model's context window
- SEARCH_DEPTH = 2  # Default depth for Google Custom Search queries
-
- # ------------------------------------------------------------------
- # Optional: patch asyncio to allow nested event loops (e.g., inside Jupyter)
- # ------------------------------------------------------------------
-
- try:
-     import nest_asyncio  # type: ignore
-
-     # ``nest_asyncio`` monkey-patches the running event-loop so that further
-     # calls to ``asyncio.run`` or ``loop.run_until_complete`` do **not** raise
-     # ``RuntimeError: This event loop is already running``. This makes the
-     # synchronous helper functions below safe to call in notebook cells while
-     # still working unchanged in regular Python scripts.
-
-     nest_asyncio.apply()
- except ImportError:  # pragma: no cover
-     # ``nest_asyncio`` is an optional dependency. If it is unavailable we
-     # simply skip patching – the helper functions will still work in regular
-     # Python scripts but may raise ``RuntimeError`` when called from within
-     # environments that already run an event-loop (e.g., Jupyter).
-     pass
-
- def search(search_item, api_key, cse_id, search_depth=SEARCH_DEPTH, site_filter=None):
-     service_url = 'https://www.googleapis.com/customsearch/v1'
-
-     params = {
-         'q': search_item,
-         'key': api_key,
-         'cx': cse_id,
-         'num': search_depth
-     }
-
-     if api_key is None or cse_id is None:
-         raise ValueError("API key and CSE ID are required")
-
-     try:
-         response = requests.get(service_url, params=params)
-         response.raise_for_status()
-         results = response.json()
-
-         # ------------------------------------------------------------------
-         # Robust handling – always return a *list* (never ``None``)
-         # ------------------------------------------------------------------
-         items = results.get("items", [])
-
-         # Optional site filtering
-         if site_filter:
-             items = [itm for itm in items if site_filter in itm.get("link", "")]
-             if not items:
-                 print(f"No results with {site_filter} found.")
-
-         # Graceful handling of empty results
-         if not items:
-             print("No search results found.")
-             return []
-
-         return items
-
-     except requests.exceptions.RequestException as e:
-         print(f"An error occurred during the search: {e}")
-         return []
-
-
-
-
- def retrieve_content(url, max_tokens=TRUNCATE_SCRAPED_TEXT):
-     try:
-         headers = {'User-Agent': 'Mozilla/5.0'}
-         response = requests.get(url, headers=headers, timeout=10)
-         response.raise_for_status()
-
-         soup = BeautifulSoup(response.content, 'html.parser')
-         for script_or_style in soup(['script', 'style']):
-             script_or_style.decompose()
-
-         text = soup.get_text(separator=' ', strip=True)
-         characters = max_tokens * 4  # Approximate conversion
-         text = text[:characters]
-         return text
-     except requests.exceptions.RequestException as e:
-         print(f"Failed to retrieve {url}: {e}")
-         return None
-
-
-
- async def get_search_results(search_items, search_term: str, character_limit: int = 500):
-     # Generate a summary of search results for the given search term
-     results_list = []
-     for idx, item in enumerate(search_items, start=1):
-         url = item.get('link')
-
-         snippet = item.get('snippet', '')
-         web_content = retrieve_content(url, TRUNCATE_SCRAPED_TEXT)
-
-         if web_content is None:
-             print(f"Error: skipped URL: {url}")
-         else:
-             summary = summarize_content(web_content, search_term, character_limit)
-             result_dict = {
-                 'order': idx,
-                 'link': url,
-                 'title': snippet,
-                 'Summary': summary
-             }
-             results_list.append(result_dict)
-     return results_list
-
- # ------------------------------------------------------------------
- # Helper using WebPageSummaryAgent for content summarisation
- # ------------------------------------------------------------------
- # NOTE:
- # ``WebPageSummaryAgent`` is an agent wrapper that internally spins up an
- # ``agents.Agent`` instance with the correct system prompt for Web page
- # summarisation. Because the ``task`` method on the wrapper is *async*, we
- # provide a small synchronous wrapper that takes care of running the coroutine
- # irrespective of whether the caller is inside an active event-loop (e.g.
- # Jupyter notebooks) or not.
-
- from ai_research_assistant_resources.agents_tools_registry.web_page_summary_agent import WebPageSummaryAgent
- import asyncio
-
-
- def summarize_content(content: str, search_term: str, character_limit: int = 2000) -> str:  # noqa: D401
-
-     # Instantiate the agent with the dynamic instructions.
-     agent = WebPageSummaryAgent(search_term=search_term, character_limit=character_limit)
+ from openai import OpenAI
+
+ client = OpenAI()
+
+
+ def openai_web_search(
+     query: str,
+     model: str = "gpt-4.1",
+     search_context_size: str = "high",
+ ) -> dict:
+     resp = client.responses.create(
+         model=model,
+         tools=[
+             {"type": "web_search_preview", "search_context_size": search_context_size}
+         ],
+         input=f"Search the web for the following information and provide citations: {query}",
+     )

-     # Run the agent task, making sure we properly handle the presence (or
-     # absence) of an already-running event-loop.
-     try:
-         return asyncio.run(agent.task(content))
-     except RuntimeError:
-         # We are *probably* inside an existing event-loop (common in notebooks
-         # or async frameworks). In that case fall back to using the current
-         # loop instead of creating a new one.
-         loop = asyncio.get_event_loop()
-         return loop.run_until_complete(agent.task(content))
+     answer = ""
+     citations = []

+     for item in resp.output:
+         if item.type == "message":
+             for part in item.content:
+                 if part.type == "output_text":
+                     answer = part.text
+                     for ann in part.annotations or []:
+                         if ann.type == "url_citation":
+                             citations.append(
+                                 {
+                                     "url": ann.url,
+                                     "title": getattr(ann, "title", None),
+                                     "start_index": getattr(ann, "start_index", None),
+                                     "end_index": getattr(ann, "end_index", None),
+                                 }
+                             )

- # ------------------------------------------------------------------
- # High-level convenience API
- # ------------------------------------------------------------------
+     return {"answer": answer, "citations": citations}


def get_results_for_search_term(
-     search_term: str,
-     *,
-     character_limit: int = 2000,
-     search_depth: int = SEARCH_DEPTH,
-     site_filter: str | None = None,
- ) -> list[dict]:
-     """Search the Web for *search_term* and return enriched result dictionaries.
-
-     The function handles the entire workflow:
-
-     1. Perform a Google Custom Search using the provided credentials.
-     2. Retrieve and clean the contents of each result page.
-     3. Generate a concise summary of each page focused on *search_term* using
-        :pyfunc:`summarize_content`.
-
-     The returned value is a ``list`` of ``dict`` objects with the following
-     keys: ``order``, ``link``, ``title`` and ``Summary``.
-     """
-
-     # Step 1 – search.
-     search_items = search(
-         search_term,
-         api_key=api_key,
-         cse_id=cse_id,
-         search_depth=search_depth,
-         site_filter=site_filter,
-     )
-
-     # Step 2 & 3 – scrape pages and summarise.
-     # ``get_search_results`` is an *async* coroutine. Execute it and
-     # return its result, transparently handling the presence (or absence)
-     # of an already-running event loop (e.g. in notebooks).
-
-     try:
-         # Prefer ``asyncio.run`` which creates and manages a fresh event
-         # loop. This is the most robust option for regular Python
-         # scripts.
-         import asyncio  # local import to avoid polluting module top-level
-
-         return asyncio.run(
-             get_search_results(search_items, search_term, character_limit)
-         )
-     except RuntimeError:
-         # We probably find ourselves inside an existing event loop (for
-         # instance when this helper is invoked from within a Jupyter
-         # notebook). Fall back to re-using the current loop.
-         loop = asyncio.get_event_loop()
-         return loop.run_until_complete(
-             get_search_results(search_items, search_term, character_limit)
-         )
+     search_term: str, *, search_context_size: str = "high"
+ ) -> dict:
+     return openai_web_search(search_term, search_context_size=search_context_size)
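
A minimal usage sketch for the helpers added in this diff, assuming the module keeps its `web_search_and_util.py` name and that `OPENAI_API_KEY` is available in the environment for the module-level `OpenAI()` client; the query string is purely illustrative:

```python
# Illustrative only: exercises the new OpenAI web-search helpers from this diff.
# Assumes OPENAI_API_KEY is set so the module-level OpenAI() client can authenticate.
from web_search_and_util import get_results_for_search_term

result = get_results_for_search_term(
    "current state of the art in retrieval-augmented generation",
    search_context_size="high",
)

print(result["answer"])                       # model-generated answer text
for citation in result["citations"]:          # url_citation annotations, if any
    print(citation["url"], citation["title"])
```

Note that, unlike the removed Google Custom Search pipeline, the helper now returns a single dict with `answer` and `citations` keys rather than a list of per-page summaries, so existing callers of `get_results_for_search_term` will need to adapt.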