Commit 7cbff65

Add examples for structured outputs and tool calling with evals. (#1888)
1 parent 475ed4c commit 7cbff65

File tree

5 files changed: +1160 -0 lines changed
Lines changed: 319 additions & 0 deletions
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "fd71cc8e",
   "metadata": {},
   "source": [
    "# Evaluating MCP-Based Answers with a Custom Dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a565afbb",
   "metadata": {},
   "source": [
    "This notebook evaluates a model's ability to answer questions about the **tiktoken** GitHub repository using the OpenAI **Evals** framework with a custom in-memory dataset. It compares two models (`gpt-4.1` and `o4-mini`) that leverage the **MCP** tool for repository-aware searches."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "31fc4911",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import time\n",
    "\n",
    "import openai\n",
    "\n",
    "# Instantiate the OpenAI client (no custom base_url).\n",
    "client = openai.OpenAI(\n",
    "    api_key=os.getenv(\"OPENAI_API_KEY\") or os.getenv(\"_OPENAI_API_KEY\"),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "840a9f6d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_dataset(limit=None):\n",
    "    items = [\n",
    "        {\n",
    "            \"query\": \"What is tiktoken?\",\n",
    "            \"answer\": \"tiktoken is a fast Byte-Pair Encoding (BPE) tokenizer designed for OpenAI models.\",\n",
    "        },\n",
    "        {\n",
    "            \"query\": \"How do I install the open-source version of tiktoken?\",\n",
    "            \"answer\": \"Install it from PyPI with `pip install tiktoken`.\",\n",
    "        },\n",
    "        {\n",
    "            \"query\": \"How do I get the tokenizer for a specific OpenAI model?\",\n",
    "            \"answer\": 'Call tiktoken.encoding_for_model(\"<model-name>\"), e.g. tiktoken.encoding_for_model(\"gpt-4o\").',\n",
    "        },\n",
    "        {\n",
    "            \"query\": \"How does tiktoken perform compared to other tokenizers?\",\n",
    "            \"answer\": \"On a 1 GB GPT-2 benchmark, tiktoken runs about 3-6x faster than GPT2TokenizerFast (tokenizers==0.13.2, transformers==4.24.0).\",\n",
    "        },\n",
    "        {\n",
    "            \"query\": \"Why is Byte-Pair Encoding (BPE) useful for language models?\",\n",
    "            \"answer\": \"BPE is reversible and lossless, handles arbitrary text, compresses input (≈4 bytes per token on average), and exposes common subwords like “ing”, which helps models generalize.\",\n",
    "        },\n",
    "    ]\n",
    "    return items[:limit] if limit else items\n",
    "\n",
    "\n",
    "pass_fail_grader = \"\"\"\n",
    "You are a helpful assistant that grades the quality of the answer to a query about a GitHub repo.\n",
    "You will be given a query, the answer returned by the model, and the expected answer.\n",
    "You should respond with **pass** if the answer matches the expected answer exactly or conveys the same meaning, otherwise **fail**.\n",
    "\"\"\"\n",
    "\n",
    "pass_fail_grader_user_prompt = \"\"\"\n",
    "<Query>\n",
    "{{item.query}}\n",
    "</Query>\n",
    "\n",
    "<Model Answer>\n",
    "{{sample.output_text}}\n",
    "</Model Answer>\n",
    "\n",
    "<Ground Truth>\n",
    "{{item.answer}}\n",
    "</Ground Truth>\n",
    "\"\"\"\n",
    "\n",
    "# Python grader: passes only if the sample recorded at least one tool call,\n",
    "# i.e. the model actually used the MCP server.\n",
    "python_mcp_grader = {\n",
    "    \"type\": \"python\",\n",
    "    \"name\": \"Assert MCP was used\",\n",
    "    \"image_tag\": \"2025-05-08\",\n",
    "    \"pass_threshold\": 1.0,\n",
    "    \"source\": \"\"\"\n",
    "def grade(sample: dict, item: dict) -> float:\n",
    "    output = sample.get('output_tools', [])\n",
    "    return 1.0 if len(output) > 0 else 0.0\n",
    "\"\"\",\n",
    "}\n",
    "\n",
    "# Create the evaluation definition.\n",
    "logs_eval = client.evals.create(\n",
    "    name=\"MCP Eval\",\n",
    "    data_source_config={\n",
    "        \"type\": \"custom\",\n",
    "        \"item_schema\": {\n",
    "            \"type\": \"object\",\n",
    "            \"properties\": {\n",
    "                \"query\": {\"type\": \"string\"},\n",
    "                \"answer\": {\"type\": \"string\"},\n",
    "            },\n",
    "        },\n",
    "        \"include_sample_schema\": True,\n",
    "    },\n",
    "    testing_criteria=[\n",
    "        {\n",
    "            \"type\": \"label_model\",\n",
    "            \"name\": \"General Evaluator\",\n",
    "            \"model\": \"o3\",\n",
    "            \"input\": [\n",
    "                {\"role\": \"system\", \"content\": pass_fail_grader},\n",
    "                {\"role\": \"user\", \"content\": pass_fail_grader_user_prompt},\n",
    "            ],\n",
    "            \"passing_labels\": [\"pass\"],\n",
    "            \"labels\": [\"pass\", \"fail\"],\n",
    "        },\n",
    "        python_mcp_grader,\n",
    "    ],\n",
    ")"
   ]
  },
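  {
   "cell_type": "markdown",
   "id": "b3e2a1c4",
   "metadata": {},
   "source": [
    "The python grader above only checks that the sample recorded at least one tool call. As a quick local sanity check, the sketch below replays that logic on hand-built `sample` dicts. The dict shapes here are an assumption for illustration; the real grader runs server-side against actual run output."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9d0f2ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Local sanity check of the python grader logic (mirrors the `source` above).\n",
    "# NOTE: the sample shapes below are assumptions for illustration only.\n",
    "def grade(sample: dict, item: dict) -> float:\n",
    "    output = sample.get(\"output_tools\", [])\n",
    "    return 1.0 if len(output) > 0 else 0.0\n",
    "\n",
    "# A sample that recorded an MCP tool call should score 1.0 (pass)...\n",
    "assert grade({\"output_tools\": [{\"type\": \"mcp_call\"}]}, {}) == 1.0\n",
    "# ...and one with no tool calls should score 0.0 (fail).\n",
    "assert grade({\"output_tools\": []}, {}) == 0.0\n",
    "print(\"grader logic behaves as expected\")"
   ]
  },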
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "15838d4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run 1: gpt-4.1 using MCP\n",
    "gpt_4one_responses_run = client.evals.runs.create(\n",
    "    name=\"gpt-4.1\",\n",
    "    eval_id=logs_eval.id,\n",
    "    data_source={\n",
    "        \"type\": \"responses\",\n",
    "        \"source\": {\n",
    "            \"type\": \"file_content\",\n",
    "            \"content\": [{\"item\": item} for item in get_dataset()],\n",
    "        },\n",
    "        \"input_messages\": {\n",
    "            \"type\": \"template\",\n",
    "            \"template\": [\n",
    "                {\n",
    "                    \"type\": \"message\",\n",
    "                    \"role\": \"system\",\n",
    "                    \"content\": {\n",
    "                        \"type\": \"input_text\",\n",
    "                        \"text\": \"You are a helpful assistant that searches the tiktoken documentation and gives contextually relevant answers. Always use your tools to answer the query.\",\n",
    "                    },\n",
    "                },\n",
    "                {\n",
    "                    \"type\": \"message\",\n",
    "                    \"role\": \"user\",\n",
    "                    \"content\": {\n",
    "                        \"type\": \"input_text\",\n",
    "                        \"text\": \"Search the tiktoken documentation for the answer to the query: {{item.query}}\",\n",
    "                    },\n",
    "                },\n",
    "            ],\n",
    "        },\n",
    "        \"model\": \"gpt-4.1\",\n",
    "        \"sampling_params\": {\n",
    "            \"seed\": 42,\n",
    "            \"temperature\": 0.7,\n",
    "            \"max_completions_tokens\": 10000,\n",
    "            \"top_p\": 0.9,\n",
    "            \"tools\": [\n",
    "                {\n",
    "                    \"type\": \"mcp\",\n",
    "                    \"server_label\": \"gitmcp\",\n",
    "                    \"server_url\": \"https://gitmcp.io/openai/tiktoken\",\n",
    "                    \"allowed_tools\": [\n",
    "                        \"search_tiktoken_documentation\",\n",
    "                        \"fetch_tiktoken_documentation\",\n",
    "                    ],\n",
    "                    \"require_approval\": \"never\",\n",
    "                }\n",
    "            ],\n",
    "        },\n",
    "    },\n",
    ")\n",
    "\n",
    "# Run 2: o4-mini using MCP\n",
    "gpt_o4_mini_responses_run = client.evals.runs.create(\n",
    "    name=\"o4-mini\",\n",
    "    eval_id=logs_eval.id,\n",
    "    data_source={\n",
    "        \"type\": \"responses\",\n",
    "        \"source\": {\n",
    "            \"type\": \"file_content\",\n",
    "            \"content\": [{\"item\": item} for item in get_dataset()],\n",
    "        },\n",
    "        \"input_messages\": {\n",
    "            \"type\": \"template\",\n",
    "            \"template\": [\n",
    "                {\n",
    "                    \"type\": \"message\",\n",
    "                    \"role\": \"system\",\n",
    "                    \"content\": {\n",
    "                        \"type\": \"input_text\",\n",
    "                        \"text\": \"You are a helpful assistant that searches the tiktoken documentation and gives contextually relevant answers.\",\n",
    "                    },\n",
    "                },\n",
    "                {\n",
    "                    \"type\": \"message\",\n",
    "                    \"role\": \"user\",\n",
    "                    \"content\": {\n",
    "                        \"type\": \"input_text\",\n",
    "                        \"text\": \"Search the tiktoken documentation for the answer to the query: {{item.query}}\",\n",
    "                    },\n",
    "                },\n",
    "            ],\n",
    "        },\n",
    "        \"model\": \"o4-mini\",\n",
    "        \"sampling_params\": {\n",
    "            \"seed\": 42,\n",
    "            \"max_completions_tokens\": 10000,\n",
    "            \"tools\": [\n",
    "                {\n",
    "                    \"type\": \"mcp\",\n",
    "                    \"server_label\": \"gitmcp\",\n",
    "                    \"server_url\": \"https://gitmcp.io/openai/tiktoken\",\n",
    "                    \"allowed_tools\": [\n",
    "                        \"search_tiktoken_documentation\",\n",
    "                        \"fetch_tiktoken_documentation\",\n",
    "                    ],\n",
    "                    \"require_approval\": \"never\",\n",
    "                }\n",
    "            ],\n",
    "        },\n",
    "    },\n",
    ")"
   ]
  },
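  {
   "cell_type": "markdown",
   "id": "a7f3d5e1",
   "metadata": {},
   "source": [
    "Before launching a full eval run, it can help to sanity-check the MCP server with a single Responses API call that uses the same tool configuration. A minimal sketch, assuming the `gitmcp` server is reachable:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1e8c7b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One-off Responses API call with the same MCP tool config as the eval runs.\n",
    "resp = client.responses.create(\n",
    "    model=\"gpt-4.1\",\n",
    "    input=\"What is tiktoken?\",\n",
    "    tools=[\n",
    "        {\n",
    "            \"type\": \"mcp\",\n",
    "            \"server_label\": \"gitmcp\",\n",
    "            \"server_url\": \"https://gitmcp.io/openai/tiktoken\",\n",
    "            \"allowed_tools\": [\n",
    "                \"search_tiktoken_documentation\",\n",
    "                \"fetch_tiktoken_documentation\",\n",
    "            ],\n",
    "            \"require_approval\": \"never\",\n",
    "        }\n",
    "    ],\n",
    ")\n",
    "print(resp.output_text)"
   ]
  },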
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1d439589",
   "metadata": {},
   "outputs": [],
   "source": [
    "def poll_runs(eval_id, run_ids):\n",
    "    while True:\n",
    "        runs = [client.evals.runs.retrieve(rid, eval_id=eval_id) for rid in run_ids]\n",
    "        for run in runs:\n",
    "            print(run.id, run.status, run.result_counts)\n",
    "        # Stop once every run has reached a terminal state.\n",
    "        if all(run.status in {\"completed\", \"failed\", \"canceled\"} for run in runs):\n",
    "            break\n",
    "        time.sleep(5)\n",
    "\n",
    "# Poll both runs until they finish.\n",
    "poll_runs(logs_eval.id, [gpt_4one_responses_run.id, gpt_o4_mini_responses_run.id])"
   ]
  },
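  {
   "cell_type": "markdown",
   "id": "d4b6e9f0",
   "metadata": {},
   "source": [
    "Once both runs reach a terminal state, `result_counts` gives a quick summary. A small sketch of computing a pass rate per run; it assumes the counts object exposes `passed` and `total` fields (check your SDK version):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e5c7a8d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: pass rate per run from result_counts.\n",
    "# Assumption: result_counts exposes `passed` and `total`; field names\n",
    "# may differ across SDK versions.\n",
    "for run_id in [gpt_4one_responses_run.id, gpt_o4_mini_responses_run.id]:\n",
    "    run = client.evals.runs.retrieve(run_id, eval_id=logs_eval.id)\n",
    "    counts = run.result_counts\n",
    "    if counts and counts.total:\n",
    "        print(f\"{run.name}: {counts.passed}/{counts.total} passed\")"
   ]
  },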
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "7e151b4a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the per-item outputs for both completed runs.\n",
    "four_one_output = client.evals.runs.output_items.list(\n",
    "    run_id=gpt_4one_responses_run.id, eval_id=logs_eval.id\n",
    ")\n",
    "\n",
    "o4_mini_output = client.evals.runs.output_items.list(\n",
    "    run_id=gpt_o4_mini_responses_run.id, eval_id=logs_eval.id\n",
    ")"
   ]
  },
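  {
   "cell_type": "markdown",
   "id": "b8d1c3f5",
   "metadata": {},
   "source": [
    "Each output item also carries per-grader results alongside the sampled answer. The sketch below prints them for the `gpt-4.1` run; it assumes each entry in `item.results` is a dict-like record with `name` and `passed` keys, which may vary by SDK version:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2a4e6d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: show which graders passed for each gpt-4.1 output item.\n",
    "# Assumption: item.results entries behave like dicts with 'name'/'passed'.\n",
    "for item in four_one_output:\n",
    "    for result in item.results:\n",
    "        print(item.id, result.get(\"name\"), \"passed\" if result.get(\"passed\") else \"failed\")"
   ]
  },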
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e68b016c",
   "metadata": {},
   "outputs": [],
   "source": [
    "print('# gpt-4.1 Output')\n",
    "for item in four_one_output:\n",
    "    print(item.sample.output[0].content)\n",
    "\n",
    "print('\\n# o4-mini Output')\n",
    "for item in o4_mini_output:\n",
    "    print(item.sample.output[0].content)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "openai",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
