Add examples for structured outputs and tool calling with evals. #1888

Merged 2 commits on Jun 9, 2025
319 changes: 319 additions & 0 deletions examples/evaluation/use-cases/mcp_eval_notebook.ipynb
@@ -0,0 +1,319 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "fd71cc8e",
"metadata": {},
"source": [
"# Evaluating MCP-Based Answers with a Custom Dataset"
]
},
{
"cell_type": "markdown",
"id": "a565afbb",
"metadata": {},
"source": [
"This notebook evaluates a model's ability to answer questions about the **tiktoken** GitHub repository using the OpenAI **Evals** framework with a custom in-memory dataset. It compares two models (`gpt-4.1` and `o4-mini`) that leverage the **MCP** tool for repository‑aware searches."
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "31fc4911",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import time\n",
"\n",
"import openai\n",
"\n",
"# Instantiate the OpenAI client (no custom base_url).\n",
"client = openai.OpenAI(\n",
" api_key=os.getenv(\"OPENAI_API_KEY\") or os.getenv(\"_OPENAI_API_KEY\"),\n",
")"
]
},
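{
"cell_type": "markdown",
"id": "a1b2c3d4",
"metadata": {},
"source": [
"Before creating any eval resources, it is worth failing fast if no API key was picked up from the environment. A minimal guard, relying only on the client's `api_key` attribute:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5c6d7e8",
"metadata": {},
"outputs": [],
"source": [
"# Fail fast with a readable error if neither environment variable was set.\n",
"assert client.api_key, \"Set OPENAI_API_KEY (or _OPENAI_API_KEY) before running this notebook.\""
]
},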
{
"cell_type": "code",
"execution_count": 14,
"id": "840a9f6d",
"metadata": {},
"outputs": [],
"source": [
"def get_dataset(limit=None):\n",
" items = [\n",
" {\n",
" \"query\": \"What is tiktoken?\",\n",
" \"answer\": \"tiktoken is a fast Byte-Pair Encoding (BPE) tokenizer designed for OpenAI models.\",\n",
" },\n",
" {\n",
" \"query\": \"How do I install the open-source version of tiktoken?\",\n",
" \"answer\": \"Install it from PyPI with `pip install tiktoken`.\",\n",
" },\n",
" {\n",
" \"query\": \"How do I get the tokenizer for a specific OpenAI model?\",\n",
" \"answer\": 'Call tiktoken.encoding_for_model(\"<model-name>\"), e.g. tiktoken.encoding_for_model(\"gpt-4o\").',\n",
" },\n",
" {\n",
" \"query\": \"How does tiktoken perform compared to other tokenizers?\",\n",
" \"answer\": \"On a 1 GB GPT-2 benchmark, tiktoken runs about 3-6x faster than GPT2TokenizerFast (tokenizers==0.13.2, transformers==4.24.0).\",\n",
" },\n",
" {\n",
" \"query\": \"Why is Byte-Pair Encoding (BPE) useful for language models?\",\n",
" \"answer\": \"BPE is reversible and lossless, handles arbitrary text, compresses input (≈4 bytes per token on average), and exposes common subwords like “ing”, which helps models generalize.\",\n",
" },\n",
" ]\n",
" return items[:limit] if limit else items\n",
"\n",
"\n",
"pass_fail_grader = \"\"\"\n",
"You are a helpful assistant that grades the quality of the answer to a query about a GitHub repo.\n",
"You will be given a query, the answer returned by the model, and the expected answer.\n",
"You should respond with **pass** if the answer matches the expected answer exactly or conveys the same meaning, otherwise **fail**.\n",
"\"\"\"\n",
"\n",
"pass_fail_grader_user_prompt = \"\"\"\n",
"<Query>\n",
"{{item.query}}\n",
"</Query>\n",
"\n",
"<Web Search Result>\n",
"{{sample.output_text}}\n",
"</Web Search Result>\n",
"\n",
"<Ground Truth>\n",
"{{item.answer}}\n",
"</Ground Truth>\n",
"\"\"\"\n",
"\n",
"python_mcp_grader = {\n",
" \"type\": \"python\",\n",
" \"name\": \"Assert MCP was used\",\n",
" \"image_tag\": \"2025-05-08\",\n",
" \"pass_threshold\": 1.0,\n",
" \"source\": \"\"\"\n",
"def grade(sample: dict, item: dict) -> float:\n",
" output = sample.get('output_tools', [])\n",
" return 1.0 if len(output) > 0 else 0.0\n",
"\"\"\",\n",
"}\n",
"\n",
"# Create the evaluation definition.\n",
"logs_eval = client.evals.create(\n",
" name=\"MCP Eval\",\n",
" data_source_config={\n",
" \"type\": \"custom\",\n",
" \"item_schema\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"query\": {\"type\": \"string\"},\n",
" \"answer\": {\"type\": \"string\"},\n",
" },\n",
" },\n",
" \"include_sample_schema\": True,\n",
" },\n",
" testing_criteria=[\n",
" {\n",
" \"type\": \"label_model\",\n",
" \"name\": \"General Evaluator\",\n",
" \"model\": \"o3\",\n",
" \"input\": [\n",
" {\"role\": \"system\", \"content\": pass_fail_grader},\n",
" {\"role\": \"user\", \"content\": pass_fail_grader_user_prompt},\n",
" ],\n",
" \"passing_labels\": [\"pass\"],\n",
" \"labels\": [\"pass\", \"fail\"],\n",
" },\n",
" python_mcp_grader\n",
" ],\n",
")"
]
},
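{
"cell_type": "markdown",
"id": "c9d0e1f2",
"metadata": {},
"source": [
"As a sanity check, we can read the eval back before launching any runs. This is a minimal sketch assuming `client.evals.retrieve` wraps the `GET /v1/evals/{eval_id}` endpoint:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d3e4f5a6",
"metadata": {},
"outputs": [],
"source": [
"# Read back the eval definition to confirm it registered with both graders.\n",
"eval_def = client.evals.retrieve(logs_eval.id)\n",
"print(eval_def.id, eval_def.name)"
]
},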
{
"cell_type": "code",
"execution_count": 15,
"id": "15838d4e",
"metadata": {},
"outputs": [],
"source": [
"# Run 1: gpt-4.1 using MCP\n",
"gpt_4one_responses_run = client.evals.runs.create(\n",
" name=\"gpt-4.1\",\n",
" eval_id=logs_eval.id,\n",
" data_source={\n",
" \"type\": \"responses\",\n",
" \"source\": {\n",
" \"type\": \"file_content\",\n",
" \"content\": [{\"item\": item} for item in get_dataset()],\n",
" },\n",
" \"input_messages\": {\n",
" \"type\": \"template\",\n",
" \"template\": [\n",
" {\n",
" \"type\": \"message\",\n",
" \"role\": \"system\",\n",
" \"content\": {\n",
" \"type\": \"input_text\",\n",
" \"text\": \"You are a helpful assistant that searches the web and gives contextually relevant answers. Never use your tools to answer the query.\",\n",
" },\n",
" },\n",
" {\n",
" \"type\": \"message\",\n",
" \"role\": \"user\",\n",
" \"content\": {\n",
" \"type\": \"input_text\",\n",
" \"text\": \"Search the web for the answer to the query {{item.query}}\",\n",
" },\n",
" },\n",
" ],\n",
" },\n",
" \"model\": \"gpt-4.1\",\n",
" \"sampling_params\": {\n",
" \"seed\": 42,\n",
" \"temperature\": 0.7,\n",
" \"max_completions_tokens\": 10000,\n",
" \"top_p\": 0.9,\n",
" \"tools\": [\n",
" {\n",
" \"type\": \"mcp\",\n",
" \"server_label\": \"gitmcp\",\n",
" \"server_url\": \"https://gitmcp.io/openai/tiktoken\",\n",
" \"allowed_tools\": [\n",
" \"search_tiktoken_documentation\",\n",
" \"fetch_tiktoken_documentation\",\n",
" ],\n",
" \"require_approval\": \"never\",\n",
" }\n",
" ],\n",
" },\n",
" },\n",
")\n",
"\n",
"# Run 2: o4-mini using MCP\n",
"gpt_o4_mini_responses_run = client.evals.runs.create(\n",
" name=\"o4-mini\",\n",
" eval_id=logs_eval.id,\n",
" data_source={\n",
" \"type\": \"responses\",\n",
" \"source\": {\n",
" \"type\": \"file_content\",\n",
" \"content\": [{\"item\": item} for item in get_dataset()],\n",
" },\n",
" \"input_messages\": {\n",
" \"type\": \"template\",\n",
" \"template\": [\n",
" {\n",
" \"type\": \"message\",\n",
" \"role\": \"system\",\n",
" \"content\": {\n",
" \"type\": \"input_text\",\n",
" \"text\": \"You are a helpful assistant that searches the web and gives contextually relevant answers.\",\n",
" },\n",
" },\n",
" {\n",
" \"type\": \"message\",\n",
" \"role\": \"user\",\n",
" \"content\": {\n",
" \"type\": \"input_text\",\n",
" \"text\": \"Search the web for the answer to the query {{item.query}}\",\n",
" },\n",
" },\n",
" ],\n",
" },\n",
" \"model\": \"o4-mini\",\n",
" \"sampling_params\": {\n",
" \"seed\": 42,\n",
" \"max_completions_tokens\": 10000,\n",
" \"tools\": [\n",
" {\n",
" \"type\": \"mcp\",\n",
" \"server_label\": \"gitmcp\",\n",
" \"server_url\": \"https://gitmcp.io/openai/tiktoken\",\n",
" \"allowed_tools\": [\n",
" \"search_tiktoken_documentation\",\n",
" \"fetch_tiktoken_documentation\",\n",
" ],\n",
" \"require_approval\": \"never\",\n",
" }\n",
" ],\n",
" },\n",
" },\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1d439589",
"metadata": {},
"outputs": [],
"source": [
"def poll_runs(eval_id, run_ids):\n",
" while True:\n",
" runs = [client.evals.runs.retrieve(rid, eval_id=eval_id) for rid in run_ids]\n",
" for run in runs:\n",
" print(run.id, run.status, run.result_counts)\n",
" if all(run.status in {\"completed\", \"failed\"} for run in runs):\n",
" break\n",
" time.sleep(5)\n",
"\n",
"# Start polling both runs.\n",
"poll_runs(logs_eval.id, [gpt_4one_responses_run.id, gpt_o4_mini_responses_run.id])"
]
},
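{
"cell_type": "markdown",
"id": "e7f8a9b0",
"metadata": {},
"source": [
"Once both runs finish, the hosted report is the quickest way to inspect individual grades. This sketch assumes each run object carries the `report_url` field that the Evals API returns per run:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f1a2b3c4",
"metadata": {},
"outputs": [],
"source": [
"# Print a dashboard report link for each run; report_url is assumed present on the run object.\n",
"for run_id in [gpt_4one_responses_run.id, gpt_o4_mini_responses_run.id]:\n",
"    run = client.evals.runs.retrieve(run_id, eval_id=logs_eval.id)\n",
"    print(run.name, run.report_url)"
]
},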
{
"cell_type": "code",
"execution_count": 11,
"id": "7e151b4a",
"metadata": {},
"outputs": [],
"source": [
"four_one_output = client.evals.runs.output_items.list(\n",
" run_id=gpt_4one_responses_run.id, eval_id=logs_eval.id\n",
")\n",
"\n",
"o4_mini_output = client.evals.runs.output_items.list(\n",
" run_id=gpt_o4_mini_responses_run.id, eval_id=logs_eval.id\n",
")"
]
},
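{
"cell_type": "markdown",
"id": "a5b6c7d8",
"metadata": {},
"source": [
"Before printing raw outputs, a quick tally of per-item verdicts is useful. This is a minimal sketch assuming each output item exposes a `status` of `pass` or `fail`; anything missing the field is counted as `unknown`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9c0d1e2",
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"\n",
"# Tally per-item verdicts; `status` is assumed to be \"pass\" or \"fail\" on each output item.\n",
"for label, output in [(\"gpt-4.1\", four_one_output), (\"o4-mini\", o4_mini_output)]:\n",
"    statuses = Counter(getattr(item, \"status\", \"unknown\") for item in output)\n",
"    print(label, dict(statuses))"
]
},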
{
"cell_type": "code",
"execution_count": null,
"id": "e68b016c",
"metadata": {},
"outputs": [],
"source": [
"print('# gpt‑4.1 Output')\n",
"for item in four_one_output:\n",
" print(item.sample.output[0].content)\n",
"\n",
"print('\\n# o4-mini Output')\n",
"for item in o4_mini_output:\n",
" print(item.sample.output[0].content)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "openai",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}