Add examples for structured outputs and tool calling with evals. #1888

Merged 2 commits on Jun 9, 2025
319 changes: 319 additions & 0 deletions examples/evaluation/use-cases/mcp_eval_notebook.ipynb
@@ -0,0 +1,319 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "fd71cc8e",
"metadata": {},
"source": [
"# Evaluating MCP-Based Answers with a Custom Dataset"
]
},
{
"cell_type": "markdown",
"id": "a565afbb",
"metadata": {},
"source": [
"This notebook evaluates a model's ability to answer questions about the **tiktoken** GitHub repository using the OpenAI **Evals** framework with a custom in-memory dataset. It compares two models (`gpt-4.1` and `o4-mini`) that leverage the **MCP** tool for repository‑aware searches."
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "31fc4911",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import time\n",
"\n",
"import openai\n",
"\n",
"# Instantiate the OpenAI client (no custom base_url).\n",
"client = openai.OpenAI(\n",
" api_key=os.getenv(\"OPENAI_API_KEY\") or os.getenv(\"_OPENAI_API_KEY\"),\n",
")"
]
},
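{
"cell_type": "markdown",
"id": "a1b2c3d4",
"metadata": {},
"source": [
"Before creating any eval resources, it is worth failing fast if no API key was picked up from the environment. A minimal guard, relying only on the client's `api_key` attribute:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5c6d7e8",
"metadata": {},
"outputs": [],
"source": [
"# Fail fast with a readable error if neither environment variable was set.\n",
"assert client.api_key, \"Set OPENAI_API_KEY (or _OPENAI_API_KEY) before running this notebook.\""
]
},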
{
"cell_type": "code",
"execution_count": 14,
"id": "840a9f6d",
"metadata": {},
"outputs": [],
"source": [
"def get_dataset(limit=None):\n",
" items = [\n",
" {\n",
" \"query\": \"What is tiktoken?\",\n",
" \"answer\": \"tiktoken is a fast Byte-Pair Encoding (BPE) tokenizer designed for OpenAI models.\",\n",
" },\n",
" {\n",
" \"query\": \"How do I install the open-source version of tiktoken?\",\n",
" \"answer\": \"Install it from PyPI with `pip install tiktoken`.\",\n",
" },\n",
" {\n",
" \"query\": \"How do I get the tokenizer for a specific OpenAI model?\",\n",
" \"answer\": 'Call tiktoken.encoding_for_model(\"<model-name>\"), e.g. tiktoken.encoding_for_model(\"gpt-4o\").',\n",
" },\n",
" {\n",
" \"query\": \"How does tiktoken perform compared to other tokenizers?\",\n",
" \"answer\": \"On a 1 GB GPT-2 benchmark, tiktoken runs about 3-6x faster than GPT2TokenizerFast (tokenizers==0.13.2, transformers==4.24.0).\",\n",
" },\n",
" {\n",
" \"query\": \"Why is Byte-Pair Encoding (BPE) useful for language models?\",\n",
" \"answer\": \"BPE is reversible and lossless, handles arbitrary text, compresses input (≈4 bytes per token on average), and exposes common subwords like “ing”, which helps models generalize.\",\n",
" },\n",
" ]\n",
" return items[:limit] if limit else items\n",
"\n",
"\n",
"pass_fail_grader = \"\"\"\n",
"You are a helpful assistant that grades the quality of the answer to a query about a GitHub repo.\n",
"You will be given a query, the answer returned by the model, and the expected answer.\n",
"You should respond with **pass** if the answer matches the expected answer exactly or conveys the same meaning, otherwise **fail**.\n",
"\"\"\"\n",
"\n",
"pass_fail_grader_user_prompt = \"\"\"\n",
"<Query>\n",
"{{item.query}}\n",
"</Query>\n",
"\n",
"<Web Search Result>\n",
"{{sample.output_text}}\n",
"</Web Search Result>\n",
"\n",
"<Ground Truth>\n",
"{{item.answer}}\n",
"</Ground Truth>\n",
"\"\"\"\n",
"\n",
"python_mcp_grader = {\n",
" \"type\": \"python\",\n",
" \"name\": \"Assert MCP was used\",\n",
" \"image_tag\": \"2025-05-08\",\n",
" \"pass_threshold\": 1.0,\n",
" \"source\": \"\"\"\n",
"def grade(sample: dict, item: dict) -> float:\n",
" output = sample.get('output_tools', [])\n",
" return 1.0 if len(output) > 0 else 0.0\n",
"\"\"\",\n",
"}\n",
"\n",
"# Create the evaluation definition.\n",
"logs_eval = client.evals.create(\n",
" name=\"MCP Eval\",\n",
" data_source_config={\n",
" \"type\": \"custom\",\n",
" \"item_schema\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"query\": {\"type\": \"string\"},\n",
" \"answer\": {\"type\": \"string\"},\n",
" },\n",
" },\n",
" \"include_sample_schema\": True,\n",
" },\n",
" testing_criteria=[\n",
" {\n",
" \"type\": \"label_model\",\n",
" \"name\": \"General Evaluator\",\n",
" \"model\": \"o3\",\n",
" \"input\": [\n",
" {\"role\": \"system\", \"content\": pass_fail_grader},\n",
" {\"role\": \"user\", \"content\": pass_fail_grader_user_prompt},\n",
" ],\n",
" \"passing_labels\": [\"pass\"],\n",
" \"labels\": [\"pass\", \"fail\"],\n",
" },\n",
" python_mcp_grader\n",
" ],\n",
")"
]
},
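{
"cell_type": "markdown",
"id": "c9d0e1f2",
"metadata": {},
"source": [
"As a sanity check, we can read the eval back before launching any runs. This is a minimal sketch assuming `client.evals.retrieve` wraps the `GET /v1/evals/{eval_id}` endpoint:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d3e4f5a6",
"metadata": {},
"outputs": [],
"source": [
"# Read back the eval definition to confirm it registered with both graders.\n",
"eval_def = client.evals.retrieve(logs_eval.id)\n",
"print(eval_def.id, eval_def.name)"
]
},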
{
"cell_type": "code",
"execution_count": 15,
"id": "15838d4e",
"metadata": {},
"outputs": [],
"source": [
"# Run 1: gpt-4.1 using MCP\n",
"gpt_4one_responses_run = client.evals.runs.create(\n",
" name=\"gpt-4.1\",\n",
" eval_id=logs_eval.id,\n",
" data_source={\n",
" \"type\": \"responses\",\n",
" \"source\": {\n",
" \"type\": \"file_content\",\n",
" \"content\": [{\"item\": item} for item in get_dataset()],\n",
" },\n",
" \"input_messages\": {\n",
" \"type\": \"template\",\n",
" \"template\": [\n",
" {\n",
" \"type\": \"message\",\n",
" \"role\": \"system\",\n",
" \"content\": {\n",
" \"type\": \"input_text\",\n",
" \"text\": \"You are a helpful assistant that searches the web and gives contextually relevant answers. Never use your tools to answer the query.\",\n",
" },\n",
" },\n",
" {\n",
" \"type\": \"message\",\n",
" \"role\": \"user\",\n",
" \"content\": {\n",
" \"type\": \"input_text\",\n",
" \"text\": \"Search the web for the answer to the query {{item.query}}\",\n",
" },\n",
" },\n",
" ],\n",
" },\n",
" \"model\": \"gpt-4.1\",\n",
" \"sampling_params\": {\n",
" \"seed\": 42,\n",
" \"temperature\": 0.7,\n",
" \"max_completions_tokens\": 10000,\n",
" \"top_p\": 0.9,\n",
" \"tools\": [\n",
" {\n",
" \"type\": \"mcp\",\n",
" \"server_label\": \"gitmcp\",\n",
" \"server_url\": \"https://gitmcp.io/openai/tiktoken\",\n",
" \"allowed_tools\": [\n",
" \"search_tiktoken_documentation\",\n",
" \"fetch_tiktoken_documentation\",\n",
" ],\n",
" \"require_approval\": \"never\",\n",
" }\n",
" ],\n",
" },\n",
" },\n",
")\n",
"\n",
"# Run 2: o4-mini using MCP\n",
"gpt_o4_mini_responses_run = client.evals.runs.create(\n",
" name=\"o4-mini\",\n",
" eval_id=logs_eval.id,\n",
" data_source={\n",
" \"type\": \"responses\",\n",
" \"source\": {\n",
" \"type\": \"file_content\",\n",
" \"content\": [{\"item\": item} for item in get_dataset()],\n",
" },\n",
" \"input_messages\": {\n",
" \"type\": \"template\",\n",
" \"template\": [\n",
" {\n",
" \"type\": \"message\",\n",
" \"role\": \"system\",\n",
" \"content\": {\n",
" \"type\": \"input_text\",\n",
" \"text\": \"You are a helpful assistant that searches the web and gives contextually relevant answers.\",\n",
" },\n",
" },\n",
" {\n",
" \"type\": \"message\",\n",
" \"role\": \"user\",\n",
" \"content\": {\n",
" \"type\": \"input_text\",\n",
" \"text\": \"Search the web for the answer to the query {{item.query}}\",\n",
" },\n",
" },\n",
" ],\n",
" },\n",
" \"model\": \"o4-mini\",\n",
" \"sampling_params\": {\n",
" \"seed\": 42,\n",
" \"max_completions_tokens\": 10000,\n",
" \"tools\": [\n",
" {\n",
" \"type\": \"mcp\",\n",
" \"server_label\": \"gitmcp\",\n",
" \"server_url\": \"https://gitmcp.io/openai/tiktoken\",\n",
" \"allowed_tools\": [\n",
" \"search_tiktoken_documentation\",\n",
" \"fetch_tiktoken_documentation\",\n",
" ],\n",
" \"require_approval\": \"never\",\n",
" }\n",
" ],\n",
" },\n",
" },\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1d439589",
"metadata": {},
"outputs": [],
"source": [
"def poll_runs(eval_id, run_ids):\n",
" while True:\n",
" runs = [client.evals.runs.retrieve(rid, eval_id=eval_id) for rid in run_ids]\n",
" for run in runs:\n",
" print(run.id, run.status, run.result_counts)\n",
" if all(run.status in {\"completed\", \"failed\"} for run in runs):\n",
" break\n",
" time.sleep(5)\n",
"\n",
"# Start polling both runs.\n",
"poll_runs(logs_eval.id, [gpt_4one_responses_run.id, gpt_o4_mini_responses_run.id])"
]
},
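{
"cell_type": "markdown",
"id": "e7f8a9b0",
"metadata": {},
"source": [
"Once both runs finish, the hosted report is the quickest way to inspect individual grades. This sketch assumes each run object carries the `report_url` field that the Evals API returns per run:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f1a2b3c4",
"metadata": {},
"outputs": [],
"source": [
"# Print a dashboard report link for each run; report_url is assumed present on the run object.\n",
"for run_id in [gpt_4one_responses_run.id, gpt_o4_mini_responses_run.id]:\n",
"    run = client.evals.runs.retrieve(run_id, eval_id=logs_eval.id)\n",
"    print(run.name, run.report_url)"
]
},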
{
"cell_type": "code",
"execution_count": 11,
"id": "7e151b4a",
"metadata": {},
"outputs": [],
"source": [
"four_one_output = client.evals.runs.output_items.list(\n",
" run_id=gpt_4one_responses_run.id, eval_id=logs_eval.id\n",
")\n",
"\n",
"o4_mini_output = client.evals.runs.output_items.list(\n",
" run_id=gpt_o4_mini_responses_run.id, eval_id=logs_eval.id\n",
")"
]
},
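{
"cell_type": "markdown",
"id": "a5b6c7d8",
"metadata": {},
"source": [
"Before printing raw outputs, a quick tally of per-item verdicts is useful. This is a minimal sketch assuming each output item exposes a `status` of `pass` or `fail`; anything missing the field is counted as `unknown`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9c0d1e2",
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"\n",
"# Tally per-item verdicts; `status` is assumed to be \"pass\" or \"fail\" on each output item.\n",
"for label, output in [(\"gpt-4.1\", four_one_output), (\"o4-mini\", o4_mini_output)]:\n",
"    statuses = Counter(getattr(item, \"status\", \"unknown\") for item in output)\n",
"    print(label, dict(statuses))"
]
},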
{
"cell_type": "code",
"execution_count": null,
"id": "e68b016c",
"metadata": {},
"outputs": [],
"source": [
"print('# gpt‑4.1 Output')\n",
"for item in four_one_output:\n",
" print(item.sample.output[0].content)\n",
"\n",
"print('\\n# o4-mini Output')\n",
"for item in o4_mini_output:\n",
" print(item.sample.output[0].content)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "openai",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}