diff --git a/examples/evaluation/use-cases/mcp_eval_notebook.ipynb b/examples/evaluation/use-cases/mcp_eval_notebook.ipynb
new file mode 100644
index 0000000000..59b7367a7e
--- /dev/null
+++ b/examples/evaluation/use-cases/mcp_eval_notebook.ipynb
@@ -0,0 +1,319 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "fd71cc8e",
+   "metadata": {},
+   "source": [
+    "# Evaluating MCP-Based Answers with a Custom Dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a565afbb",
+   "metadata": {},
+   "source": [
+    "This notebook evaluates a model's ability to answer questions about the **tiktoken** GitHub repository using the OpenAI **Evals** framework with a custom in-memory dataset. It compares two models (`gpt-4.1` and `o4-mini`) that use an **MCP** (Model Context Protocol) tool for repository-aware searches."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "31fc4911",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import time\n",
+    "\n",
+    "import openai\n",
+    "\n",
+    "# Instantiate the OpenAI client (no custom base_url).\n",
+    "client = openai.OpenAI(\n",
+    "    api_key=os.getenv(\"OPENAI_API_KEY\") or os.getenv(\"_OPENAI_API_KEY\"),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "840a9f6d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_dataset(limit=None):\n",
+    "    items = [\n",
+    "        {\n",
+    "            \"query\": \"What is tiktoken?\",\n",
+    "            \"answer\": \"tiktoken is a fast Byte-Pair Encoding (BPE) tokenizer designed for OpenAI models.\",\n",
+    "        },\n",
+    "        {\n",
+    "            \"query\": \"How do I install the open-source version of tiktoken?\",\n",
+    "            \"answer\": \"Install it from PyPI with `pip install tiktoken`.\",\n",
+    "        },\n",
+    "        {\n",
+    "            \"query\": \"How do I get the tokenizer for a specific OpenAI model?\",\n",
+    "            \"answer\": 'Call tiktoken.encoding_for_model(\"<model-name>\"), e.g. tiktoken.encoding_for_model(\"gpt-4o\").',\n",
+    "        },\n",
+    "        {\n",
+    "            \"query\": \"How does tiktoken perform compared to other tokenizers?\",\n",
+    "            \"answer\": \"On a 1 GB GPT-2 benchmark, tiktoken runs about 3-6x faster than GPT2TokenizerFast (tokenizers==0.13.2, transformers==4.24.0).\",\n",
+    "        },\n",
+    "        {\n",
+    "            \"query\": \"Why is Byte-Pair Encoding (BPE) useful for language models?\",\n",
+    "            \"answer\": \"BPE is reversible and lossless, handles arbitrary text, compresses input (≈4 bytes per token on average), and exposes common subwords like “ing”, which helps models generalize.\",\n",
+    "        },\n",
+    "    ]\n",
+    "    return items[:limit] if limit else items\n",
+    "\n",
+    "\n",
+    "pass_fail_grader = \"\"\"\n",
+    "You are a helpful assistant that grades the quality of the answer to a query about a GitHub repo.\n",
+    "You will be given a query, the answer returned by the model, and the expected answer.\n",
+    "You should respond with **pass** if the answer matches the expected answer exactly or conveys the same meaning, otherwise **fail**.\n",
+    "\"\"\"\n",
+    "\n",
+    "pass_fail_grader_user_prompt = \"\"\"\n",
+    "<Query>\n",
+    "{{item.query}}\n",
+    "</Query>\n",
+    "\n",
+    "<Web Search Result>\n",
+    "{{sample.output_text}}\n",
+    "</Web Search Result>\n",
+    "\n",
+    "<Ground Truth>\n",
+    "{{item.answer}}\n",
+    "</Ground Truth>\n",
+    "\"\"\"\n",
+    "\n",
+    "python_mcp_grader = {\n",
+    "    \"type\": \"python\",\n",
+    "    \"name\": \"Assert MCP was used\",\n",
+    "    \"image_tag\": \"2025-05-08\",\n",
+    "    \"pass_threshold\": 1.0,\n",
+    "    \"source\": \"\"\"\n",
+    "def grade(sample: dict, item: dict) -> float:\n",
+    "    output = sample.get('output_tools', [])\n",
+    "    return 1.0 if len(output) > 0 else 0.0\n",
+    "\"\"\",\n",
+    "}\n",
+    "\n",
+    "# Create the evaluation definition.\n",
+    "logs_eval = client.evals.create(\n",
+    "    name=\"MCP Eval\",\n",
+    "    data_source_config={\n",
+    "        \"type\": \"custom\",\n",
+    "        \"item_schema\": {\n",
+    "            \"type\": \"object\",\n",
+    "            \"properties\": {\n",
+    "                \"query\": {\"type\": \"string\"},\n",
+    "                \"answer\": {\"type\": \"string\"},\n",
+    "            },\n",
+    "        },\n",
+    "        \"include_sample_schema\": True,\n",
+    "    },\n",
+    "    testing_criteria=[\n",
+    "        {\n",
+    "            \"type\": \"label_model\",\n",
+    "            \"name\": \"General Evaluator\",\n",
+    "            \"model\": \"o3\",\n",
+    "            \"input\": [\n",
+    "                {\"role\": \"system\", \"content\": pass_fail_grader},\n",
+    "                {\"role\": \"user\", \"content\": pass_fail_grader_user_prompt},\n",
+    "            ],\n",
+    "            \"passing_labels\": [\"pass\"],\n",
+    "            \"labels\": [\"pass\", \"fail\"],\n",
+    "        },\n",
+    "        python_mcp_grader\n",
+    "    ],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "15838d4e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run 1: gpt-4.1 using MCP\n",
+    "# Samples every dataset item with gpt-4.1 through the `responses` data source,\n",
+    "# with the gitmcp tiktoken MCP server attached as a tool.\n",
+    "# NOTE(review): the system prompt below tells the model never to use its tools,\n",
+    "# while this run attaches the MCP tool and the eval asserts tool usage -\n",
+    "# confirm this is an intentional negative control.\n",
+    "gpt_4one_responses_run = client.evals.runs.create(\n",
+    "    name=\"gpt-4.1\",\n",
+    "    eval_id=logs_eval.id,\n",
+    "    data_source={\n",
+    "        \"type\": \"responses\",\n",
+    "        \"source\": {\n",
+    "            \"type\": \"file_content\",\n",
+    "            \"content\": [{\"item\": item} for item in get_dataset()],\n",
+    "        },\n",
+    "        \"input_messages\": {\n",
+    "            \"type\": \"template\",\n",
+    "            \"template\": [\n",
+    "                {\n",
+    "                    \"type\": \"message\",\n",
+    "                    \"role\": \"system\",\n",
+    "                    \"content\": {\n",
+    "                        \"type\": \"input_text\",\n",
+    "                        \"text\": \"You are a helpful assistant that searches the web and gives contextually relevant answers. Never use your tools to answer the query.\",\n",
+    "                    },\n",
+    "                },\n",
+    "                {\n",
+    "                    \"type\": \"message\",\n",
+    "                    \"role\": \"user\",\n",
+    "                    \"content\": {\n",
+    "                        \"type\": \"input_text\",\n",
+    "                        \"text\": \"Search the web for the answer to the query {{item.query}}\",\n",
+    "                    },\n",
+    "                },\n",
+    "            ],\n",
+    "        },\n",
+    "        \"model\": \"gpt-4.1\",\n",
+    "        \"sampling_params\": {\n",
+    "            \"seed\": 42,\n",
+    "            \"temperature\": 0.7,\n",
+    "            # The sampling-params field is spelled `max_completion_tokens`;\n",
+    "            # the previous `max_completions_tokens` key was a misspelling.\n",
+    "            \"max_completion_tokens\": 10000,\n",
+    "            \"top_p\": 0.9,\n",
+    "            \"tools\": [\n",
+    "                {\n",
+    "                    \"type\": \"mcp\",\n",
+    "                    \"server_label\": \"gitmcp\",\n",
+    "                    \"server_url\": \"https://gitmcp.io/openai/tiktoken\",\n",
+    "                    \"allowed_tools\": [\n",
+    "                        \"search_tiktoken_documentation\",\n",
+    "                        \"fetch_tiktoken_documentation\",\n",
+    "                    ],\n",
+    "                    \"require_approval\": \"never\",\n",
+    "                }\n",
+    "            ],\n",
+    "        },\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "# Run 2: o4-mini using MCP\n",
+    "# Samples every dataset item with o4-mini through the `responses` data source,\n",
+    "# with the same gitmcp tiktoken MCP server attached as a tool.\n",
+    "gpt_o4_mini_responses_run = client.evals.runs.create(\n",
+    "    name=\"o4-mini\",\n",
+    "    eval_id=logs_eval.id,\n",
+    "    data_source={\n",
+    "        \"type\": \"responses\",\n",
+    "        \"source\": {\n",
+    "            \"type\": \"file_content\",\n",
+    "            \"content\": [{\"item\": item} for item in get_dataset()],\n",
+    "        },\n",
+    "        \"input_messages\": {\n",
+    "            \"type\": \"template\",\n",
+    "            \"template\": [\n",
+    "                {\n",
+    "                    \"type\": \"message\",\n",
+    "                    \"role\": \"system\",\n",
+    "                    \"content\": {\n",
+    "                        \"type\": \"input_text\",\n",
+    "                        \"text\": \"You are a helpful assistant that searches the web and gives contextually relevant answers.\",\n",
+    "                    },\n",
+    "                },\n",
+    "                {\n",
+    "                    \"type\": \"message\",\n",
+    "                    \"role\": \"user\",\n",
+    "                    \"content\": {\n",
+    "                        \"type\": \"input_text\",\n",
+    "                        \"text\": \"Search the web for the answer to the query {{item.query}}\",\n",
+    "                    },\n",
+    "                },\n",
+    "            ],\n",
+    "        },\n",
+    "        \"model\": \"o4-mini\",\n",
+    "        \"sampling_params\": {\n",
+    "            \"seed\": 42,\n",
+    "            # The sampling-params field is spelled `max_completion_tokens`;\n",
+    "            # the previous `max_completions_tokens` key was a misspelling.\n",
+    "            \"max_completion_tokens\": 10000,\n",
+    "            \"tools\": [\n",
+    "                {\n",
+    "                    \"type\": \"mcp\",\n",
+    "                    \"server_label\": \"gitmcp\",\n",
+    "                    \"server_url\": \"https://gitmcp.io/openai/tiktoken\",\n",
+    "                    \"allowed_tools\": [\n",
+    "                        \"search_tiktoken_documentation\",\n",
+    "                        \"fetch_tiktoken_documentation\",\n",
+    "                    ],\n",
+    "                    \"require_approval\": \"never\",\n",
+    "                }\n",
+    "            ],\n",
+    "        },\n",
+    "    },\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1d439589",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def poll_runs(eval_id, run_ids, interval_seconds=5):\n",
+    "    \"\"\"Poll the given eval runs until every one reaches a terminal status.\n",
+    "\n",
+    "    Args:\n",
+    "        eval_id: ID of the eval the runs belong to.\n",
+    "        run_ids: IDs of the runs to watch.\n",
+    "        interval_seconds: Pause between polling passes (default 5).\n",
+    "\n",
+    "    Each pass prints every run's id, status and result counts.\n",
+    "    \"\"\"\n",
+    "    # `canceled` is terminal too; without it a canceled run would keep this\n",
+    "    # loop polling forever.\n",
+    "    terminal_statuses = {\"completed\", \"failed\", \"canceled\"}\n",
+    "    while True:\n",
+    "        runs = [client.evals.runs.retrieve(rid, eval_id=eval_id) for rid in run_ids]\n",
+    "        for run in runs:\n",
+    "            print(run.id, run.status, run.result_counts)\n",
+    "        if all(run.status in terminal_statuses for run in runs):\n",
+    "            break\n",
+    "        time.sleep(interval_seconds)\n",
+    "\n",
+    "# Start polling both runs.\n",
+    "poll_runs(logs_eval.id, [gpt_4one_responses_run.id, gpt_o4_mini_responses_run.id])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "7e151b4a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "four_one_output = client.evals.runs.output_items.list(\n",
+    "    run_id=gpt_4one_responses_run.id, eval_id=logs_eval.id\n",
+    ")\n",
+    "\n",
+    "o4_mini_output = client.evals.runs.output_items.list(\n",
+    "    run_id=gpt_o4_mini_responses_run.id, eval_id=logs_eval.id\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e68b016c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Print a heading per run, then the content of each item's first output message.\n",
+    "for heading, output_items in (\n",
+    "    ('# gpt‑4.1 Output', four_one_output),\n",
+    "    ('\\n# o4-mini Output', o4_mini_output),\n",
+    "):\n",
+    "    print(heading)\n",
+    "    for item in output_items:\n",
+    "        print(item.sample.output[0].content)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "openai",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/evaluation/use-cases/structured-outputs-evaluation.ipynb b/examples/evaluation/use-cases/structured-outputs-evaluation.ipynb
new file mode 100644
index 0000000000..d255fe79aa
--- /dev/null
+++ b/examples/evaluation/use-cases/structured-outputs-evaluation.ipynb
@@ -0,0 +1,335 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "9dd88e7c",
+   "metadata": {},
+   "source": [
+    "# Evaluating Code Symbol Extraction with Structured Outputs and a Custom Dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "64bf0667",
+   "metadata": {},
+   "source": [
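+    "This notebook demonstrates how to evaluate a model's ability to extract symbols from code as structured (JSON-schema) output using the OpenAI **Evals** framework with a custom in-memory dataset. It runs `gpt-4.1` through the `completions` data source and `gpt-4.1-mini` through the `responses` data source."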
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "eacc6ac7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import time\n",
+    "import openai\n",
+    "\n",
+    "client = openai.OpenAI(\n",
+    "    api_key=os.getenv(\"OPENAI_API_KEY\") or os.getenv(\"_OPENAI_API_KEY\"),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "b272e193",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_dataset(limit=None):\n",
+    "    \"\"\"Build dataset items from source files of the installed `openai` package.\n",
+    "\n",
+    "    Each item is a dict whose `input` key holds the full text of one of five\n",
+    "    modules under the package's `resources` directory.\n",
+    "\n",
+    "    Args:\n",
+    "        limit: When truthy, only the first `limit` files are read and returned;\n",
+    "            None or 0 returns an item for every file.\n",
+    "\n",
+    "    Returns:\n",
+    "        A list of item dicts, in the order the files are listed below.\n",
+    "    \"\"\"\n",
+    "    resources_dir = os.path.join(os.path.dirname(openai.__file__), \"resources\")\n",
+    "\n",
+    "    file_paths = [\n",
+    "        os.path.join(resources_dir, \"evals\", \"evals.py\"),\n",
+    "        os.path.join(resources_dir, \"responses\", \"responses.py\"),\n",
+    "        os.path.join(resources_dir, \"images.py\"),\n",
+    "        os.path.join(resources_dir, \"embeddings.py\"),\n",
+    "        os.path.join(resources_dir, \"files.py\"),\n",
+    "    ]\n",
+    "    # Slice the paths first so files beyond the limit are never opened.\n",
+    "    if limit:\n",
+    "        file_paths = file_paths[:limit]\n",
+    "\n",
+    "    items = []\n",
+    "    for file_path in file_paths:\n",
+    "        # The context manager closes each handle promptly; the explicit encoding\n",
+    "        # avoids depending on the platform's default text encoding.\n",
+    "        with open(file_path, \"r\", encoding=\"utf-8\") as source_file:\n",
+    "            items.append({\"input\": source_file.read()})\n",
+    "    return items\n",
+    "\n",
+    "\n",
+    "structured_output_grader = \"\"\"\n",
+    "You are a helpful assistant that grades the quality of extracted information from a code file.\n",
+    "You will be given a code file and a list of extracted information.\n",
+    "You should grade the quality of the extracted information.\n",
+    "\n",
+    "You should grade the quality on a scale of 1 to 7.\n",
+    "You should apply the following criteria, and calculate your score as follows:\n",
+    "You should first check for completeness on a scale of 1 to 7.\n",
+    "Then you should apply a quality modifier.\n",
+    "\n",
+    "The quality modifier is a multiplier from 0 to 1 that you multiply by the completeness score.\n",
+    "If there is 100% coverage for completion and it is all high quality, then you would return 7*1.\n",
+    "If there is 100% coverage for completion but it is all low quality, then you would return 7*0.5.\n",
+    "etc.\n",
+    "\"\"\"\n",
+    "\n",
+    "structured_output_grader_user_prompt = \"\"\"\n",
+    "<Code File>\n",
+    "{{item.input}}\n",
+    "</Code File>\n",
+    "\n",
+    "<Extracted Information>\n",
+    "{{sample.output_json.symbols}}\n",
+    "</Extracted Information>\n",
+    "\"\"\"\n",
+    "\n",
+    "logs_eval = client.evals.create(\n",
+    "    name=\"Code QA Eval\",\n",
+    "    data_source_config={\n",
+    "        \"type\": \"custom\",\n",
+    "        \"item_schema\": {\n",
+    "            \"type\": \"object\",\n",
+    "            \"properties\": {\"input\": {\"type\": \"string\"}},\n",
+    "        },\n",
+    "        \"include_sample_schema\": True,\n",
+    "    },\n",
+    "    testing_criteria=[\n",
+    "        {\n",
+    "            \"type\": \"score_model\",\n",
+    "            \"name\": \"General Evaluator\",\n",
+    "            \"model\": \"o3\",\n",
+    "            \"input\": [\n",
+    "                {\"role\": \"system\", \"content\": structured_output_grader},\n",
+    "                {\"role\": \"user\", \"content\": structured_output_grader_user_prompt},\n",
+    "            ],\n",
+    "            \"range\": [1, 7],\n",
+    "            \"pass_threshold\": 5.5,\n",
+    "        }\n",
+    "    ],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "18f357e6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run 1: gpt-4.1 through the `completions` data source, constrained to the\n",
+    "# `python_symbols` JSON schema via `response_format`. Only the first dataset\n",
+    "# item is used (limit=1).\n",
+    "gpt_4one_completions_run = client.evals.runs.create(\n",
+    "    name=\"gpt-4.1\",\n",
+    "    eval_id=logs_eval.id,\n",
+    "    data_source={\n",
+    "        \"type\": \"completions\",\n",
+    "        \"source\": {\n",
+    "            \"type\": \"file_content\",\n",
+    "            \"content\": [{\"item\": item} for item in get_dataset(limit=1)],\n",
+    "        },\n",
+    "        \"input_messages\": {\n",
+    "            \"type\": \"template\",\n",
+    "            \"template\": [\n",
+    "                {\n",
+    "                    \"type\": \"message\",\n",
+    "                    \"role\": \"system\",\n",
+    "                    \"content\": {\"type\": \"input_text\", \"text\": \"You are a helpful assistant.\"},\n",
+    "                },\n",
+    "                {\n",
+    "                    \"type\": \"message\",\n",
+    "                    \"role\": \"user\",\n",
+    "                    \"content\": {\n",
+    "                        \"type\": \"input_text\",\n",
+    "                        \"text\": \"Extract the symbols from the code file {{item.input}}\",\n",
+    "                    },\n",
+    "                },\n",
+    "            ],\n",
+    "        },\n",
+    "        \"model\": \"gpt-4.1\",\n",
+    "        \"sampling_params\": {\n",
+    "            \"seed\": 42,\n",
+    "            \"temperature\": 0.7,\n",
+    "            # The sampling-params field is spelled `max_completion_tokens`;\n",
+    "            # the previous `max_completions_tokens` key was a misspelling.\n",
+    "            \"max_completion_tokens\": 10000,\n",
+    "            \"top_p\": 0.9,\n",
+    "            \"response_format\": {\n",
+    "                \"type\": \"json_schema\",\n",
+    "                \"json_schema\": {\n",
+    "                    \"name\": \"python_symbols\",\n",
+    "                    \"schema\": {\n",
+    "                        \"type\": \"object\",\n",
+    "                        \"properties\": {\n",
+    "                            \"symbols\": {\n",
+    "                                \"type\": \"array\",\n",
+    "                                \"description\": \"A list of symbols extracted from Python code.\",\n",
+    "                                \"items\": {\n",
+    "                                    \"type\": \"object\",\n",
+    "                                    \"properties\": {\n",
+    "                                        \"name\": {\"type\": \"string\", \"description\": \"The name of the symbol.\"},\n",
+    "                                        \"symbol_type\": {\n",
+    "                                            \"type\": \"string\", \"description\": \"The type of the symbol, e.g., variable, function, class.\",\n",
+    "                                        },\n",
+    "                                    },\n",
+    "                                    \"required\": [\"name\", \"symbol_type\"],\n",
+    "                                    \"additionalProperties\": False,\n",
+    "                                },\n",
+    "                            }\n",
+    "                        },\n",
+    "                        \"required\": [\"symbols\"],\n",
+    "                        \"additionalProperties\": False,\n",
+    "                    },\n",
+    "                    \"strict\": True,\n",
+    "                },\n",
+    "            },\n",
+    "        },\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "# Run 2: gpt-4.1-mini through the `responses` data source, constrained to the\n",
+    "# same `python_symbols` JSON schema via `text.format`.\n",
+    "# NOTE(review): the variable is named `gpt_4one_responses_run` although the model\n",
+    "# is gpt-4.1-mini; the name is kept because later cells reference it.\n",
+    "gpt_4one_responses_run = client.evals.runs.create(\n",
+    "    name=\"gpt-4.1-mini\",\n",
+    "    eval_id=logs_eval.id,\n",
+    "    data_source={\n",
+    "        \"type\": \"responses\",\n",
+    "        \"source\": {\n",
+    "            \"type\": \"file_content\",\n",
+    "            \"content\": [{\"item\": item} for item in get_dataset(limit=1)],\n",
+    "        },\n",
+    "        \"input_messages\": {\n",
+    "            \"type\": \"template\",\n",
+    "            \"template\": [\n",
+    "                {\n",
+    "                    \"type\": \"message\",\n",
+    "                    \"role\": \"system\",\n",
+    "                    \"content\": {\"type\": \"input_text\", \"text\": \"You are a helpful assistant.\"},\n",
+    "                },\n",
+    "                {\n",
+    "                    \"type\": \"message\",\n",
+    "                    \"role\": \"user\",\n",
+    "                    \"content\": {\n",
+    "                        \"type\": \"input_text\",\n",
+    "                        \"text\": \"Extract the symbols from the code file {{item.input}}\",\n",
+    "                    },\n",
+    "                },\n",
+    "            ],\n",
+    "        },\n",
+    "        \"model\": \"gpt-4.1-mini\",\n",
+    "        \"sampling_params\": {\n",
+    "            \"seed\": 42,\n",
+    "            \"temperature\": 0.7,\n",
+    "            # The sampling-params field is spelled `max_completion_tokens`;\n",
+    "            # the previous `max_completions_tokens` key was a misspelling.\n",
+    "            \"max_completion_tokens\": 10000,\n",
+    "            \"top_p\": 0.9,\n",
+    "            \"text\": {\n",
+    "                \"format\": {\n",
+    "                    \"type\": \"json_schema\",\n",
+    "                    \"name\": \"python_symbols\",\n",
+    "                    \"schema\": {\n",
+    "                        \"type\": \"object\",\n",
+    "                        \"properties\": {\n",
+    "                            \"symbols\": {\n",
+    "                                \"type\": \"array\",\n",
+    "                                \"description\": \"A list of symbols extracted from Python code.\",\n",
+    "                                \"items\": {\n",
+    "                                    \"type\": \"object\",\n",
+    "                                    \"properties\": {\n",
+    "                                        \"name\": {\"type\": \"string\", \"description\": \"The name of the symbol.\"},\n",
+    "                                        \"symbol_type\": {\n",
+    "                                            \"type\": \"string\",\n",
+    "                                            \"description\": \"The type of the symbol, e.g., variable, function, class.\",\n",
+    "                                        },\n",
+    "                                    },\n",
+    "                                    \"required\": [\"name\", \"symbol_type\"],\n",
+    "                                    \"additionalProperties\": False,\n",
+    "                                },\n",
+    "                            }\n",
+    "                        },\n",
+    "                        \"required\": [\"symbols\"],\n",
+    "                        \"additionalProperties\": False,\n",
+    "                    },\n",
+    "                    \"strict\": True,\n",
+    "                },\n",
+    "            },\n",
+    "        },\n",
+    "    },\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cbc4f775",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def poll_runs(eval_id, run_ids, interval_seconds=5):\n",
+    "    \"\"\"Poll the given eval runs until all are finished, then dump their outputs.\n",
+    "\n",
+    "    Args:\n",
+    "        eval_id: ID of the eval the runs belong to.\n",
+    "        run_ids: IDs of the runs to watch.\n",
+    "        interval_seconds: Pause between polling passes (default 5).\n",
+    "\n",
+    "    Each pass prints every run's id, status and result counts. Once every run\n",
+    "    is in a terminal status, the output-item listing of each run is written\n",
+    "    as JSON to `<run id>.json` in the current working directory.\n",
+    "    \"\"\"\n",
+    "    # `canceled` is terminal too; without it a canceled run would keep this\n",
+    "    # loop polling forever.\n",
+    "    terminal_statuses = {\"completed\", \"failed\", \"canceled\"}\n",
+    "    while True:\n",
+    "        runs = [client.evals.runs.retrieve(rid, eval_id=eval_id) for rid in run_ids]\n",
+    "        for run in runs:\n",
+    "            print(run.id, run.status, run.result_counts)\n",
+    "        if all(run.status in terminal_statuses for run in runs):\n",
+    "            # dump results to file\n",
+    "            # NOTE(review): this serializes the object returned by `.list()`;\n",
+    "            # presumably that is only the first page of items - confirm for large runs.\n",
+    "            for run in runs:\n",
+    "                # Explicit UTF-8 so non-ASCII model output does not depend on\n",
+    "                # the platform's default encoding.\n",
+    "                with open(f\"{run.id}.json\", \"w\", encoding=\"utf-8\") as f:\n",
+    "                    f.write(\n",
+    "                        client.evals.runs.output_items.list(\n",
+    "                            run_id=run.id, eval_id=eval_id\n",
+    "                        ).model_dump_json(indent=4)\n",
+    "                    )\n",
+    "            break\n",
+    "        time.sleep(interval_seconds)\n",
+    "\n",
+    "poll_runs(logs_eval.id, [gpt_4one_completions_run.id, gpt_4one_responses_run.id])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "c316e6eb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "completions_output = client.evals.runs.output_items.list(\n",
+    "    run_id=gpt_4one_completions_run.id, eval_id=logs_eval.id\n",
+    ")\n",
+    "\n",
+    "responses_output = client.evals.runs.output_items.list(\n",
+    "    run_id=gpt_4one_responses_run.id, eval_id=logs_eval.id\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9f1b502e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Print a heading per run, followed by every output item of that run.\n",
+    "for heading, output_items in (\n",
+    "    ('# Completions Output', completions_output),\n",
+    "    ('\\n# Responses Output', responses_output),\n",
+    "):\n",
+    "    print(heading)\n",
+    "    for item in output_items:\n",
+    "        print(item)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "openai",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/evaluation/use-cases/tools-evaluation.ipynb b/examples/evaluation/use-cases/tools-evaluation.ipynb
new file mode 100644
index 0000000000..cd5c72b52e
--- /dev/null
+++ b/examples/evaluation/use-cases/tools-evaluation.ipynb
@@ -0,0 +1,268 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Evaluating Code Symbol Extraction Quality with a Custom Dataset"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "This notebook demonstrates how to evaluate a model's ability to extract symbols from code files using the OpenAI **Evals** framework with a custom in-memory dataset."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "evalrun_68437e5370c481919a6874594ca177d9 queued ResultCounts(errored=0, failed=0, passed=0, total=0)\n",
+            "evalrun_68437e544fe881918f76dbd8dce3fd15 queued ResultCounts(errored=0, failed=0, passed=0, total=0)\n",
+            "evalrun_68437e5370c481919a6874594ca177d9 in_progress ResultCounts(errored=0, failed=0, passed=0, total=0)\n",
+            "evalrun_68437e544fe881918f76dbd8dce3fd15 in_progress ResultCounts(errored=0, failed=0, passed=0, total=0)\n",
+            "evalrun_68437e5370c481919a6874594ca177d9 in_progress ResultCounts(errored=0, failed=0, passed=0, total=0)\n",
+            "evalrun_68437e544fe881918f76dbd8dce3fd15 in_progress ResultCounts(errored=0, failed=0, passed=0, total=0)\n",
+            "evalrun_68437e5370c481919a6874594ca177d9 in_progress ResultCounts(errored=0, failed=0, passed=0, total=0)\n",
+            "evalrun_68437e544fe881918f76dbd8dce3fd15 completed ResultCounts(errored=0, failed=0, passed=1, total=1)\n",
+            "evalrun_68437e5370c481919a6874594ca177d9 in_progress ResultCounts(errored=0, failed=0, passed=0, total=0)\n",
+            "evalrun_68437e544fe881918f76dbd8dce3fd15 completed ResultCounts(errored=0, failed=0, passed=1, total=1)\n",
+            "evalrun_68437e5370c481919a6874594ca177d9 completed ResultCounts(errored=0, failed=1, passed=0, total=1)\n",
+            "evalrun_68437e544fe881918f76dbd8dce3fd15 completed ResultCounts(errored=0, failed=0, passed=1, total=1)\n"
+          ]
+        }
+      ],
+      "source": [
+        "import os\n",
+        "import time\n",
+        "\n",
+        "import openai\n",
+        "\n",
+        "client = openai.OpenAI(\n",
+        "    api_key=os.getenv(\"OPENAI_API_KEY\") or os.getenv(\"_OPENAI_API_KEY\"),\n",
+        ")\n",
+        "\n",
+        "\n",
+        "def get_dataset(limit=None):\n",
+        "    \"\"\"Build eval items from source files of the installed OpenAI SDK.\n",
+        "\n",
+        "    Args:\n",
+        "        limit: Optional cap on the number of items; a falsy value returns all.\n",
+        "\n",
+        "    Returns:\n",
+        "        A list of ``{\"input\": <file contents>}`` dicts, one per file read.\n",
+        "    \"\"\"\n",
+        "    openai_sdk_file_path = os.path.dirname(openai.__file__)\n",
+        "\n",
+        "    file_paths = [\n",
+        "        os.path.join(openai_sdk_file_path, \"resources\", \"evals\", \"evals.py\"),\n",
+        "        os.path.join(openai_sdk_file_path, \"resources\", \"responses\", \"responses.py\"),\n",
+        "        os.path.join(openai_sdk_file_path, \"resources\", \"images.py\"),\n",
+        "        os.path.join(openai_sdk_file_path, \"resources\", \"embeddings.py\"),\n",
+        "        os.path.join(openai_sdk_file_path, \"resources\", \"files.py\"),\n",
+        "    ]\n",
+        "    if limit:\n",
+        "        # Trim first so files that will not be returned are never read.\n",
+        "        file_paths = file_paths[:limit]\n",
+        "\n",
+        "    items = []\n",
+        "    for file_path in file_paths:\n",
+        "        # The context manager closes each handle instead of leaking it.\n",
+        "        with open(file_path, \"r\", encoding=\"utf-8\") as source_file:\n",
+        "            items.append({\"input\": source_file.read()})\n",
+        "    return items\n",
+        "\n",
+        "\n",
+        "structured_output_grader = \"\"\"\n",
+        "You are a helpful assistant that grades the quality of extracted information from a code file.\n",
+        "You will be given a code file and a list of extracted information.\n",
+        "You should grade the quality of the extracted information.\n",
+        "\n",
+        "You should grade the quality on a scale of 1 to 7.\n",
+        "You should apply the following criteria, and calculate your score as follows:\n",
+        "You should first check for completeness on a scale of 1 to 7.\n",
+        "Then you should apply a quality modifier.\n",
+        "\n",
+        "The quality modifier is a multiplier from 0 to 1 that you multiply by the completeness score.\n",
+        "If there is 100% coverage for completion and it is all high quality, then you would return 7*1.\n",
+        "If there is 100% coverage for completion but it is all low quality, then you would return 7*0.5.\n",
+        "etc.\n",
+        "\"\"\"\n",
+        "\n",
+        "structured_output_grader_user_prompt = \"\"\"\n",
+        "<Code File>\n",
+        "{{item.input}}\n",
+        "</Code File>\n",
+        "\n",
+        "<Extracted Information>\n",
+        "{{sample.output_tools[0].function.arguments.symbols}}\n",
+        "</Extracted Information>\n",
+        "\"\"\"\n",
+        "\n",
+        "logs_eval = client.evals.create(\n",
+        "    name=\"Code QA Eval\",\n",
+        "    data_source_config={\n",
+        "        \"type\": \"custom\",\n",
+        "        \"item_schema\": {\"type\": \"object\", \"properties\": {\"input\": {\"type\": \"string\"}}},\n",
+        "        \"include_sample_schema\": True,\n",
+        "    },\n",
+        "    testing_criteria=[\n",
+        "        {\n",
+        "            \"type\": \"score_model\",\n",
+        "            \"name\": \"General Evaluator\",\n",
+        "            \"model\": \"o3\",\n",
+        "            \"input\": [\n",
+        "                {\"role\": \"system\", \"content\": structured_output_grader},\n",
+        "                {\"role\": \"user\", \"content\": structured_output_grader_user_prompt},\n",
+        "            ],\n",
+        "            \"range\": [1, 7],\n",
+        "            \"pass_threshold\": 5.5,\n",
+        "        }\n",
+        "    ],\n",
+        ")\n",
+        "\n",
+        "symbol_tool = {\n",
+        "    \"name\": \"extract_symbols\",\n",
+        "    \"description\": \"Extract the symbols from the code file\",\n",
+        "    \"parameters\": {\n",
+        "        \"type\": \"object\",\n",
+        "        \"properties\": {\n",
+        "            \"symbols\": {\n",
+        "                \"type\": \"array\",\n",
+        "                \"description\": \"A list of symbols extracted from Python code.\",\n",
+        "                \"items\": {\n",
+        "                    \"type\": \"object\",\n",
+        "                    \"properties\": {\n",
+        "                        \"name\": {\"type\": \"string\", \"description\": \"The name of the symbol.\"},\n",
+        "                        \"symbol_type\": {\"type\": \"string\", \"description\": \"The type of the symbol, e.g., variable, function, class.\"},\n",
+        "                    },\n",
+        "                    \"required\": [\"name\", \"symbol_type\"],\n",
+        "                    \"additionalProperties\": False,\n",
+        "                },\n",
+        "            }\n",
+        "        },\n",
+        "        \"required\": [\"symbols\"],\n",
+        "        \"additionalProperties\": False,\n",
+        "    },\n",
+        "}\n",
+        "\n",
+        "# Run gpt-4.1 through the `completions` data source with the extract_symbols\n",
+        "# function tool available (tool definition nested under the `function` key).\n",
+        "gpt_4one_completions_run = client.evals.runs.create(\n",
+        "    name=\"gpt-4.1\",\n",
+        "    eval_id=logs_eval.id,\n",
+        "    data_source={\n",
+        "        \"type\": \"completions\",\n",
+        "        \"source\": {\"type\": \"file_content\", \"content\": [{\"item\": item} for item in get_dataset(limit=1)]},\n",
+        "        \"input_messages\": {\n",
+        "            \"type\": \"template\",\n",
+        "            \"template\": [\n",
+        "                {\"type\": \"message\", \"role\": \"system\", \"content\": {\"type\": \"input_text\", \"text\": \"You are a helpful assistant.\"}},\n",
+        "                {\"type\": \"message\", \"role\": \"user\", \"content\": {\"type\": \"input_text\", \"text\": \"Extract the symbols from the code file {{item.input}}\"}},\n",
+        "            ],\n",
+        "        },\n",
+        "        \"model\": \"gpt-4.1\",\n",
+        "        \"sampling_params\": {\n",
+        "            \"seed\": 42,\n",
+        "            \"temperature\": 0.7,\n",
+        "            \"max_completion_tokens\": 10000,\n",
+        "            \"top_p\": 0.9,\n",
+        "            \"tools\": [{\"type\": \"function\", \"function\": symbol_tool}],\n",
+        "        },\n",
+        "    },\n",
+        ")\n",
+        "\n",
+        "# Run gpt-4.1 through the `responses` data source; here the function tool\n",
+        "# fields are spread flat next to `type` instead of nested under `function`.\n",
+        "gpt_4one_responses_run = client.evals.runs.create(\n",
+        "    name=\"gpt-4.1\",\n",
+        "    eval_id=logs_eval.id,\n",
+        "    data_source={\n",
+        "        \"type\": \"responses\",\n",
+        "        \"source\": {\"type\": \"file_content\", \"content\": [{\"item\": item} for item in get_dataset(limit=1)]},\n",
+        "        \"input_messages\": {\n",
+        "            \"type\": \"template\",\n",
+        "            \"template\": [\n",
+        "                {\"type\": \"message\", \"role\": \"system\", \"content\": {\"type\": \"input_text\", \"text\": \"You are a helpful assistant.\"}},\n",
+        "                {\"type\": \"message\", \"role\": \"user\", \"content\": {\"type\": \"input_text\", \"text\": \"Extract the symbols from the code file {{item.input}}\"}},\n",
+        "            ],\n",
+        "        },\n",
+        "        \"model\": \"gpt-4.1\",\n",
+        "        \"sampling_params\": {\n",
+        "            \"seed\": 42,\n",
+        "            \"temperature\": 0.7,\n",
+        "            \"max_completion_tokens\": 10000,\n",
+        "            \"top_p\": 0.9,\n",
+        "            \"tools\": [{\"type\": \"function\", **symbol_tool}],\n",
+        "        },\n",
+        "    },\n",
+        ")\n",
+        "\n",
+        "\n",
+        "def poll_runs(eval_id, run_ids, poll_interval=5):\n",
+        "    \"\"\"Print each run's status until every run reaches a terminal state.\n",
+        "\n",
+        "    Args:\n",
+        "        eval_id: ID of the eval that owns the runs.\n",
+        "        run_ids: IDs of the runs to poll together.\n",
+        "        poll_interval: Seconds to sleep between polling rounds.\n",
+        "    \"\"\"\n",
+        "    # \"canceled\" is terminal too; without it a canceled run would poll forever.\n",
+        "    terminal_statuses = (\"completed\", \"failed\", \"canceled\")\n",
+        "    while True:\n",
+        "        runs = [client.evals.runs.retrieve(run_id, eval_id=eval_id) for run_id in run_ids]\n",
+        "        for run in runs:\n",
+        "            print(run.id, run.status, run.result_counts)\n",
+        "        if all(run.status in terminal_statuses for run in runs):\n",
+        "            break\n",
+        "        time.sleep(poll_interval)\n",
+        "\n",
+        "\n",
+        "poll_runs(logs_eval.id, [gpt_4one_completions_run.id, gpt_4one_responses_run.id])\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "completions_output = client.evals.runs.output_items.list(\n",
+        "    run_id=gpt_4one_completions_run.id, eval_id=logs_eval.id\n",
+        ")\n",
+        "\n",
+        "responses_output = client.evals.runs.output_items.list(\n",
+        "    run_id=gpt_4one_responses_run.id, eval_id=logs_eval.id\n",
+        ")\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 13,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "[{'name': 'Evals', 'symbol_type': 'class'}, {'name': 'AsyncEvals', 'symbol_type': 'class'}, {'name': 'EvalsWithRawResponse', 'symbol_type': 'class'}, {'name': 'AsyncEvalsWithRawResponse', 'symbol_type': 'class'}, {'name': 'EvalsWithStreamingResponse', 'symbol_type': 'class'}, {'name': 'AsyncEvalsWithStreamingResponse', 'symbol_type': 'class'}, {'name': '__all__', 'symbol_type': 'variable'}, {'name': 'Evals.runs', 'symbol_type': 'function'}, {'name': 'Evals.with_raw_response', 'symbol_type': 'function'}, {'name': 'Evals.with_streaming_response', 'symbol_type': 'function'}, {'name': 'Evals.create', 'symbol_type': 'function'}, {'name': 'Evals.retrieve', 'symbol_type': 'function'}, {'name': 'Evals.update', 'symbol_type': 'function'}, {'name': 'Evals.list', 'symbol_type': 'function'}, {'name': 'Evals.delete', 'symbol_type': 'function'}, {'name': 'AsyncEvals.runs', 'symbol_type': 'function'}, {'name': 'AsyncEvals.with_raw_response', 'symbol_type': 'function'}, {'name': 'AsyncEvals.with_streaming_response', 'symbol_type': 'function'}, {'name': 'AsyncEvals.create', 'symbol_type': 'function'}, {'name': 'AsyncEvals.retrieve', 'symbol_type': 'function'}, {'name': 'AsyncEvals.update', 'symbol_type': 'function'}, {'name': 'AsyncEvals.list', 'symbol_type': 'function'}, {'name': 'AsyncEvals.delete', 'symbol_type': 'function'}, {'name': 'EvalsWithRawResponse.__init__', 'symbol_type': 'function'}, {'name': 'EvalsWithRawResponse.runs', 'symbol_type': 'function'}, {'name': 'AsyncEvalsWithRawResponse.__init__', 'symbol_type': 'function'}, {'name': 'AsyncEvalsWithRawResponse.runs', 'symbol_type': 'function'}, {'name': 'EvalsWithStreamingResponse.__init__', 'symbol_type': 'function'}, {'name': 'EvalsWithStreamingResponse.runs', 'symbol_type': 'function'}, {'name': 'AsyncEvalsWithStreamingResponse.__init__', 'symbol_type': 'function'}, {'name': 'AsyncEvalsWithStreamingResponse.runs', 'symbol_type': 'function'}]\n",
+            "[{'name': 'Evals', 'symbol_type': 'class'}, {'name': 'AsyncEvals', 'symbol_type': 'class'}, {'name': 'EvalsWithRawResponse', 'symbol_type': 'class'}, {'name': 'AsyncEvalsWithRawResponse', 'symbol_type': 'class'}, {'name': 'EvalsWithStreamingResponse', 'symbol_type': 'class'}, {'name': 'AsyncEvalsWithStreamingResponse', 'symbol_type': 'class'}, {'name': '__all__', 'symbol_type': 'variable'}]\n"
+          ]
+        }
+      ],
+      "source": [
+        "import json\n",
+        "\n",
+        "for item in completions_output:\n",
+        "    print(json.loads(item.sample.output[0].tool_calls[0][\"function\"][\"arguments\"])[\"symbols\"])\n",
+        "\n",
+        "for item in responses_output:\n",
+        "    print(json.loads(item.sample.output[0].tool_calls[0][\"function\"][\"arguments\"])[\"symbols\"])\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "openai",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.12.9"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+}
diff --git a/examples/evaluation/use-cases/web-search-evaluation.ipynb b/examples/evaluation/use-cases/web-search-evaluation.ipynb
new file mode 100644
index 0000000000..91f9dbb5f3
--- /dev/null
+++ b/examples/evaluation/use-cases/web-search-evaluation.ipynb
@@ -0,0 +1,197 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Evaluating Web Search Quality with a Custom Dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook demonstrates how to evaluate a model's ability to retrieve correct answers from the web using the OpenAI **Evals** framework with a custom in-memory dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import time\n",
+    "\n",
+    "import openai\n",
+    "\n",
+    "client = openai.OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\") or os.getenv(\"_OPENAI_API_KEY\"))\n",
+    "\n",
+    "\n",
+    "def get_dataset(limit=None):\n",
+    "    \"\"\"Return the in-memory web search eval items.\n",
+    "\n",
+    "    Args:\n",
+    "        limit: Optional cap on the number of items; a falsy value returns all.\n",
+    "\n",
+    "    Returns:\n",
+    "        A list of dicts, each with a ``query`` and an ``answer`` string.\n",
+    "    \"\"\"\n",
+    "    items = [\n",
+    "        {\n",
+    "            \"query\": \"coolest person in the world, the 100m dash at the 2008 olympics was the best sports event of all time\",\n",
+    "            \"answer\": \"usain bolt\",\n",
+    "        },\n",
+    "        {\n",
+    "            \"query\": \"best library in the world, there is nothing better than a dataframe\",\n",
+    "            \"answer\": \"pandas\",\n",
+    "        },\n",
+    "        {\n",
+    "            \"query\": \"most fun place to visit, I am obsessed with the Philbrook Museum of Art\",\n",
+    "            \"answer\": \"tulsa, oklahoma\",\n",
+    "        },\n",
+    "    ]\n",
+    "    # Honor `limit` so callers can run the eval on a subset of the items.\n",
+    "    return items[:limit] if limit else items\n",
+    "\n",
+    "\n",
+    "pass_fail_grader = \"\"\"\n",
+    "You are a helpful assistant that grades the quality of a web search.\n",
+    "You will be given a query, a web search result, and a ground-truth answer.\n",
+    "You should grade the quality of the web search.\n",
+    "\n",
+    "You should say \"pass\" if the web search result contains the ground-truth answer, and \"fail\" otherwise.\n",
+    "\n",
+    "\"\"\"\n",
+    "\n",
+    "pass_fail_grader_user_prompt = \"\"\"\n",
+    "<Query>\n",
+    "{{item.query}}\n",
+    "</Query>\n",
+    "\n",
+    "<Web Search Result>\n",
+    "{{sample.output_text}}\n",
+    "</Web Search Result>\n",
+    "\n",
+    "<Ground Truth>\n",
+    "{{item.answer}}\n",
+    "</Ground Truth>\n",
+    "\"\"\"\n",
+    "\n",
+    "logs_eval = client.evals.create(\n",
+    "    name=\"Web Search Eval\",\n",
+    "    data_source_config={\n",
+    "        \"type\": \"custom\",\n",
+    "        \"item_schema\": {\n",
+    "            \"type\": \"object\",\n",
+    "            \"properties\": {\n",
+    "                \"query\": {\"type\": \"string\"},\n",
+    "                \"answer\": {\"type\": \"string\"},\n",
+    "            },\n",
+    "        },\n",
+    "        \"include_sample_schema\": True,\n",
+    "    },\n",
+    "    testing_criteria=[\n",
+    "        {\n",
+    "            \"type\": \"label_model\",\n",
+    "            \"name\": \"Web Search Evaluator\",\n",
+    "            \"model\": \"o3\",\n",
+    "            \"input\": [\n",
+    "                {\n",
+    "                    \"role\": \"system\",\n",
+    "                    \"content\": pass_fail_grader,\n",
+    "                },\n",
+    "                {\n",
+    "                    \"role\": \"user\",\n",
+    "                    \"content\": pass_fail_grader_user_prompt,\n",
+    "                },\n",
+    "            ],\n",
+    "            \"passing_labels\": [\"pass\"],\n",
+    "            \"labels\": [\"pass\", \"fail\"],\n",
+    "        }\n",
+    "    ],\n",
+    ")\n",
+    "\n",
+    "# Run gpt-4.1 through the `responses` data source with the web_search_preview\n",
+    "# tool enabled, sampling one answer per dataset item.\n",
+    "gpt_4one_responses_run = client.evals.runs.create(\n",
+    "    name=\"gpt-4.1\",\n",
+    "    eval_id=logs_eval.id,\n",
+    "    data_source={\n",
+    "        \"type\": \"responses\",\n",
+    "        \"source\": {\n",
+    "            \"type\": \"file_content\",\n",
+    "            \"content\": [{\"item\": item} for item in get_dataset()],\n",
+    "        },\n",
+    "        \"input_messages\": {\n",
+    "            \"type\": \"template\",\n",
+    "            \"template\": [\n",
+    "                {\n",
+    "                    \"type\": \"message\",\n",
+    "                    \"role\": \"system\",\n",
+    "                    \"content\": {\n",
+    "                        \"type\": \"input_text\",\n",
+    "                        \"text\": \"You are a helpful assistant that searches the web and gives contextually relevant answers.\",\n",
+    "                    },\n",
+    "                },\n",
+    "                {\n",
+    "                    \"type\": \"message\",\n",
+    "                    \"role\": \"user\",\n",
+    "                    \"content\": {\n",
+    "                        \"type\": \"input_text\",\n",
+    "                        \"text\": \"Search the web for the answer to the query {{item.query}}\",\n",
+    "                    },\n",
+    "                },\n",
+    "            ],\n",
+    "        },\n",
+    "        \"model\": \"gpt-4.1\",\n",
+    "        \"sampling_params\": {\n",
+    "            \"seed\": 42,\n",
+    "            \"temperature\": 0.7,\n",
+    "            \"max_completion_tokens\": 10000,\n",
+    "            \"top_p\": 0.9,\n",
+    "            \"tools\": [{\"type\": \"web_search_preview\"}],\n",
+    "        },\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "\n",
+    "def poll_runs(eval_id, run_ids, poll_interval=5):\n",
+    "    \"\"\"Print each run's status until every run reaches a terminal state.\n",
+    "\n",
+    "    Args:\n",
+    "        eval_id: ID of the eval that owns the runs.\n",
+    "        run_ids: IDs of the runs to poll (one or more).\n",
+    "        poll_interval: Seconds to sleep between polling rounds.\n",
+    "    \"\"\"\n",
+    "    # \"canceled\" is terminal too; without it a canceled run would poll forever.\n",
+    "    terminal_statuses = (\"completed\", \"failed\", \"canceled\")\n",
+    "    while True:\n",
+    "        runs = [client.evals.runs.retrieve(run_id, eval_id=eval_id) for run_id in run_ids]\n",
+    "        for run in runs:\n",
+    "            print(run.id, run.status, run.result_counts)\n",
+    "        if all(run.status in terminal_statuses for run in runs):\n",
+    "            break\n",
+    "        time.sleep(poll_interval)\n",
+    "\n",
+    "\n",
+    "poll_runs(logs_eval.id, [gpt_4one_responses_run.id])\n",
+    "\n",
+    "four_one = client.evals.runs.output_items.list(\n",
+    "    run_id=gpt_4one_responses_run.id, eval_id=logs_eval.id\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for item in four_one:\n",
+    "    print(item.sample.output[0].content)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "openai",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/registry.yaml b/registry.yaml
index d84d06caf0..f1160e9391 100644
--- a/registry.yaml
+++ b/registry.yaml
@@ -2129,3 +2129,44 @@
     - evals
     - stripe
     - conversion
+
+- title: Evals API Use-case - MCP Evaluation
+  path: examples/evaluation/use-cases/mcp_eval_notebook.ipynb
+  date: 2025-06-09
+  authors:
+    - josiah-openai
+  tags:
+    - evals-api
+    - responses
+    - evals
+    - mcp
+
+- title: Evals API Use-case - Structured Outputs Evaluation
+  path: examples/evaluation/use-cases/structured-outputs-evaluation.ipynb
+  date: 2025-06-09
+  authors:
+    - josiah-openai
+  tags:
+    - evals-api
+    - responses
+    - evals
+
+- title: Evals API Use-case - Tools Evaluation
+  path: examples/evaluation/use-cases/tools-evaluation.ipynb
+  date: 2025-06-09
+  authors:
+    - josiah-openai
+  tags:
+    - evals-api
+    - responses
+    - evals
+
+- title: Evals API Use-case - Web Search Evaluation
+  path: examples/evaluation/use-cases/web-search-evaluation.ipynb
+  date: 2025-06-09
+  authors:
+    - josiah-openai
+  tags:
+    - evals-api
+    - responses
+    - evals