diff --git a/examples/evaluation/use-cases/mcp_eval_notebook.ipynb b/examples/evaluation/use-cases/mcp_eval_notebook.ipynb new file mode 100644 index 0000000000..59b7367a7e --- /dev/null +++ b/examples/evaluation/use-cases/mcp_eval_notebook.ipynb @@ -0,0 +1,319 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fd71cc8e", + "metadata": {}, + "source": [ + "# Evaluating MCP-Based Answers with a Custom Dataset" + ] + }, + { + "cell_type": "markdown", + "id": "a565afbb", + "metadata": {}, + "source": [ + "This notebook evaluates a model's ability to answer questions about the **tiktoken** GitHub repository using the OpenAI **Evals** framework with a custom in-memory dataset. It compares two models (`gpt-4.1` and `o4-mini`) that leverage the **MCP** tool for repository‑aware searches." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "31fc4911", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import time\n", + "\n", + "import openai\n", + "\n", + "# Instantiate the OpenAI client (no custom base_url).\n", + "client = openai.OpenAI(\n", + " api_key=os.getenv(\"OPENAI_API_KEY\") or os.getenv(\"_OPENAI_API_KEY\"),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "840a9f6d", + "metadata": {}, + "outputs": [], + "source": [ + "def get_dataset(limit=None):\n", + " items = [\n", + " {\n", + " \"query\": \"What is tiktoken?\",\n", + " \"answer\": \"tiktoken is a fast Byte-Pair Encoding (BPE) tokenizer designed for OpenAI models.\",\n", + " },\n", + " {\n", + " \"query\": \"How do I install the open-source version of tiktoken?\",\n", + " \"answer\": \"Install it from PyPI with `pip install tiktoken`.\",\n", + " },\n", + " {\n", + " \"query\": \"How do I get the tokenizer for a specific OpenAI model?\",\n", + " \"answer\": 'Call tiktoken.encoding_for_model(\"\"), e.g. 
tiktoken.encoding_for_model(\"gpt-4o\").',\n", + " },\n", + " {\n", + " \"query\": \"How does tiktoken perform compared to other tokenizers?\",\n", + " \"answer\": \"On a 1 GB GPT-2 benchmark, tiktoken runs about 3-6x faster than GPT2TokenizerFast (tokenizers==0.13.2, transformers==4.24.0).\",\n", + " },\n", + " {\n", + " \"query\": \"Why is Byte-Pair Encoding (BPE) useful for language models?\",\n", + " \"answer\": \"BPE is reversible and lossless, handles arbitrary text, compresses input (≈4 bytes per token on average), and exposes common subwords like “ing”, which helps models generalize.\",\n", + " },\n", + " ]\n", + " return items[:limit] if limit else items\n", + "\n", + "\n", + "pass_fail_grader = \"\"\"\n", + "You are a helpful assistant that grades the quality of the answer to a query about a GitHub repo.\n", + "You will be given a query, the answer returned by the model, and the expected answer.\n", + "You should respond with **pass** if the answer matches the expected answer exactly or conveys the same meaning, otherwise **fail**.\n", + "\"\"\"\n", + "\n", + "pass_fail_grader_user_prompt = \"\"\"\n", + "\n", + "{{item.query}}\n", + "\n", + "\n", + "\n", + "{{sample.output_text}}\n", + "\n", + "\n", + "\n", + "{{item.answer}}\n", + "\n", + "\"\"\"\n", + "\n", + "python_mcp_grader = {\n", + " \"type\": \"python\",\n", + " \"name\": \"Assert MCP was used\",\n", + " \"image_tag\": \"2025-05-08\",\n", + " \"pass_threshold\": 1.0,\n", + " \"source\": \"\"\"\n", + "def grade(sample: dict, item: dict) -> float:\n", + " output = sample.get('output_tools', [])\n", + " return 1.0 if len(output) > 0 else 0.0\n", + "\"\"\",\n", + "}\n", + "\n", + "# Create the evaluation definition.\n", + "logs_eval = client.evals.create(\n", + " name=\"MCP Eval\",\n", + " data_source_config={\n", + " \"type\": \"custom\",\n", + " \"item_schema\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"query\": {\"type\": \"string\"},\n", + " \"answer\": {\"type\": 
\"string\"},\n", + " },\n", + " },\n", + " \"include_sample_schema\": True,\n", + " },\n", + " testing_criteria=[\n", + " {\n", + " \"type\": \"label_model\",\n", + " \"name\": \"General Evaluator\",\n", + " \"model\": \"o3\",\n", + " \"input\": [\n", + " {\"role\": \"system\", \"content\": pass_fail_grader},\n", + " {\"role\": \"user\", \"content\": pass_fail_grader_user_prompt},\n", + " ],\n", + " \"passing_labels\": [\"pass\"],\n", + " \"labels\": [\"pass\", \"fail\"],\n", + " },\n", + " python_mcp_grader\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "15838d4e", + "metadata": {}, + "outputs": [], + "source": [ + "# Run 1: gpt-4.1 using MCP\n", + "gpt_4one_responses_run = client.evals.runs.create(\n", + " name=\"gpt-4.1\",\n", + " eval_id=logs_eval.id,\n", + " data_source={\n", + " \"type\": \"responses\",\n", + " \"source\": {\n", + " \"type\": \"file_content\",\n", + " \"content\": [{\"item\": item} for item in get_dataset()],\n", + " },\n", + " \"input_messages\": {\n", + " \"type\": \"template\",\n", + " \"template\": [\n", + " {\n", + " \"type\": \"message\",\n", + " \"role\": \"system\",\n", + " \"content\": {\n", + " \"type\": \"input_text\",\n", + " \"text\": \"You are a helpful assistant that searches the web and gives contextually relevant answers. 
Never use your tools to answer the query.\",\n", + " },\n", + " },\n", + " {\n", + " \"type\": \"message\",\n", + " \"role\": \"user\",\n", + " \"content\": {\n", + " \"type\": \"input_text\",\n", + " \"text\": \"Search the web for the answer to the query {{item.query}}\",\n", + " },\n", + " },\n", + " ],\n", + " },\n", + " \"model\": \"gpt-4.1\",\n", + " \"sampling_params\": {\n", + " \"seed\": 42,\n", + " \"temperature\": 0.7,\n", + " \"max_completions_tokens\": 10000,\n", + " \"top_p\": 0.9,\n", + " \"tools\": [\n", + " {\n", + " \"type\": \"mcp\",\n", + " \"server_label\": \"gitmcp\",\n", + " \"server_url\": \"https://gitmcp.io/openai/tiktoken\",\n", + " \"allowed_tools\": [\n", + " \"search_tiktoken_documentation\",\n", + " \"fetch_tiktoken_documentation\",\n", + " ],\n", + " \"require_approval\": \"never\",\n", + " }\n", + " ],\n", + " },\n", + " },\n", + ")\n", + "\n", + "# Run 2: o4-mini using MCP\n", + "gpt_o4_mini_responses_run = client.evals.runs.create(\n", + " name=\"o4-mini\",\n", + " eval_id=logs_eval.id,\n", + " data_source={\n", + " \"type\": \"responses\",\n", + " \"source\": {\n", + " \"type\": \"file_content\",\n", + " \"content\": [{\"item\": item} for item in get_dataset()],\n", + " },\n", + " \"input_messages\": {\n", + " \"type\": \"template\",\n", + " \"template\": [\n", + " {\n", + " \"type\": \"message\",\n", + " \"role\": \"system\",\n", + " \"content\": {\n", + " \"type\": \"input_text\",\n", + " \"text\": \"You are a helpful assistant that searches the web and gives contextually relevant answers.\",\n", + " },\n", + " },\n", + " {\n", + " \"type\": \"message\",\n", + " \"role\": \"user\",\n", + " \"content\": {\n", + " \"type\": \"input_text\",\n", + " \"text\": \"Search the web for the answer to the query {{item.query}}\",\n", + " },\n", + " },\n", + " ],\n", + " },\n", + " \"model\": \"o4-mini\",\n", + " \"sampling_params\": {\n", + " \"seed\": 42,\n", + " \"max_completions_tokens\": 10000,\n", + " \"tools\": [\n", + " {\n", + " 
\"type\": \"mcp\",\n", + " \"server_label\": \"gitmcp\",\n", + " \"server_url\": \"https://gitmcp.io/openai/tiktoken\",\n", + " \"allowed_tools\": [\n", + " \"search_tiktoken_documentation\",\n", + " \"fetch_tiktoken_documentation\",\n", + " ],\n", + " \"require_approval\": \"never\",\n", + " }\n", + " ],\n", + " },\n", + " },\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d439589", + "metadata": {}, + "outputs": [], + "source": [ + "def poll_runs(eval_id, run_ids):\n", + " while True:\n", + " runs = [client.evals.runs.retrieve(rid, eval_id=eval_id) for rid in run_ids]\n", + " for run in runs:\n", + " print(run.id, run.status, run.result_counts)\n", + " if all(run.status in {\"completed\", \"failed\"} for run in runs):\n", + " break\n", + " time.sleep(5)\n", + "\n", + "# Start polling both runs.\n", + "poll_runs(logs_eval.id, [gpt_4one_responses_run.id, gpt_o4_mini_responses_run.id])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "7e151b4a", + "metadata": {}, + "outputs": [], + "source": [ + "four_one_output = client.evals.runs.output_items.list(\n", + " run_id=gpt_4one_responses_run.id, eval_id=logs_eval.id\n", + ")\n", + "\n", + "o4_mini_output = client.evals.runs.output_items.list(\n", + " run_id=gpt_o4_mini_responses_run.id, eval_id=logs_eval.id\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e68b016c", + "metadata": {}, + "outputs": [], + "source": [ + "print('# gpt‑4.1 Output')\n", + "for item in four_one_output:\n", + " print(item.sample.output[0].content)\n", + "\n", + "print('\\n# o4-mini Output')\n", + "for item in o4_mini_output:\n", + " print(item.sample.output[0].content)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "openai", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/evaluation/use-cases/structured-outputs-evaluation.ipynb b/examples/evaluation/use-cases/structured-outputs-evaluation.ipynb new file mode 100644 index 0000000000..d255fe79aa --- /dev/null +++ b/examples/evaluation/use-cases/structured-outputs-evaluation.ipynb @@ -0,0 +1,335 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9dd88e7c", + "metadata": {}, + "source": [ + "# Evaluating Code Quality Extraction with a Custom Dataset" + ] + }, + { + "cell_type": "markdown", + "id": "64bf0667", + "metadata": {}, + "source": [ + "This notebook demonstrates how to evaluate a model's ability to extract symbols from code using the OpenAI **Evals** framework with a custom in-memory dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "eacc6ac7", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import time\n", + "import openai\n", + "\n", + "client = openai.OpenAI(\n", + " api_key=os.getenv(\"OPENAI_API_KEY\") or os.getenv(\"_OPENAI_API_KEY\"),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b272e193", + "metadata": {}, + "outputs": [], + "source": [ + "def get_dataset(limit=None):\n", + " openai_sdk_file_path = os.path.dirname(openai.__file__)\n", + "\n", + " file_paths = [\n", + " os.path.join(openai_sdk_file_path, \"resources\", \"evals\", \"evals.py\"),\n", + " os.path.join(openai_sdk_file_path, \"resources\", \"responses\", \"responses.py\"),\n", + " os.path.join(openai_sdk_file_path, \"resources\", \"images.py\"),\n", + " os.path.join(openai_sdk_file_path, \"resources\", \"embeddings.py\"),\n", + " os.path.join(openai_sdk_file_path, \"resources\", \"files.py\"),\n", + " ]\n", + "\n", + " items = []\n", + " for file_path in file_paths:\n", + " items.append({\"input\": open(file_path, \"r\").read()})\n", + " if limit:\n", + " return 
items[:limit]\n", + " return items\n", + "\n", + "\n", + "structured_output_grader = \"\"\"\n", + "You are a helpful assistant that grades the quality of extracted information from a code file.\n", + "You will be given a code file and a list of extracted information.\n", + "You should grade the quality of the extracted information.\n", + "\n", + "You should grade the quality on a scale of 1 to 7.\n", + "You should apply the following criteria, and calculate your score as follows:\n", + "You should first check for completeness on a scale of 1 to 7.\n", + "Then you should apply a quality modifier.\n", + "\n", + "The quality modifier is a multiplier from 0 to 1 that you multiply by the completeness score.\n", + "If there is 100% coverage for completion and it is all high quality, then you would return 7*1.\n", + "If there is 100% coverage for completion but it is all low quality, then you would return 7*0.5.\n", + "etc.\n", + "\"\"\"\n", + "\n", + "structured_output_grader_user_prompt = \"\"\"\n", + "\n", + "{{item.input}}\n", + "\n", + "\n", + "\n", + "{{sample.output_json.symbols}}\n", + "\n", + "\"\"\"\n", + "\n", + "logs_eval = client.evals.create(\n", + " name=\"Code QA Eval\",\n", + " data_source_config={\n", + " \"type\": \"custom\",\n", + " \"item_schema\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\"input\": {\"type\": \"string\"}},\n", + " },\n", + " \"include_sample_schema\": True,\n", + " },\n", + " testing_criteria=[\n", + " {\n", + " \"type\": \"score_model\",\n", + " \"name\": \"General Evaluator\",\n", + " \"model\": \"o3\",\n", + " \"input\": [\n", + " {\"role\": \"system\", \"content\": structured_output_grader},\n", + " {\"role\": \"user\", \"content\": structured_output_grader_user_prompt},\n", + " ],\n", + " \"range\": [1, 7],\n", + " \"pass_threshold\": 5.5,\n", + " }\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "18f357e6", + "metadata": {}, + "outputs": [], + "source": [ + 
"gpt_4one_completions_run = client.evals.runs.create(\n", + " name=\"gpt-4.1\",\n", + " eval_id=logs_eval.id,\n", + " data_source={\n", + " \"type\": \"completions\",\n", + " \"source\": {\n", + " \"type\": \"file_content\",\n", + " \"content\": [{\"item\": item} for item in get_dataset(limit=1)],\n", + " },\n", + " \"input_messages\": {\n", + " \"type\": \"template\",\n", + " \"template\": [\n", + " {\n", + " \"type\": \"message\",\n", + " \"role\": \"system\",\n", + " \"content\": {\"type\": \"input_text\", \"text\": \"You are a helpful assistant.\"},\n", + " },\n", + " {\n", + " \"type\": \"message\",\n", + " \"role\": \"user\",\n", + " \"content\": {\n", + " \"type\": \"input_text\",\n", + " \"text\": \"Extract the symbols from the code file {{item.input}}\",\n", + " },\n", + " },\n", + " ],\n", + " },\n", + " \"model\": \"gpt-4.1\",\n", + " \"sampling_params\": {\n", + " \"seed\": 42,\n", + " \"temperature\": 0.7,\n", + " \"max_completions_tokens\": 10000,\n", + " \"top_p\": 0.9,\n", + " \"response_format\": {\n", + " \"type\": \"json_schema\",\n", + " \"json_schema\": {\n", + " \"name\": \"python_symbols\",\n", + " \"schema\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"symbols\": {\n", + " \"type\": \"array\",\n", + " \"description\": \"A list of symbols extracted from Python code.\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"name\": {\"type\": \"string\", \"description\": \"The name of the symbol.\"},\n", + " \"symbol_type\": {\n", + " \"type\": \"string\", \"description\": \"The type of the symbol, e.g., variable, function, class.\",\n", + " },\n", + " },\n", + " \"required\": [\"name\", \"symbol_type\"],\n", + " \"additionalProperties\": False,\n", + " },\n", + " }\n", + " },\n", + " \"required\": [\"symbols\"],\n", + " \"additionalProperties\": False,\n", + " },\n", + " \"strict\": True,\n", + " },\n", + " },\n", + " },\n", + " },\n", + ")\n", + "\n", + "gpt_4one_responses_run = 
client.evals.runs.create(\n", + " name=\"gpt-4.1-mini\",\n", + " eval_id=logs_eval.id,\n", + " data_source={\n", + " \"type\": \"responses\",\n", + " \"source\": {\n", + " \"type\": \"file_content\",\n", + " \"content\": [{\"item\": item} for item in get_dataset(limit=1)],\n", + " },\n", + " \"input_messages\": {\n", + " \"type\": \"template\",\n", + " \"template\": [\n", + " {\n", + " \"type\": \"message\",\n", + " \"role\": \"system\",\n", + " \"content\": {\"type\": \"input_text\", \"text\": \"You are a helpful assistant.\"},\n", + " },\n", + " {\n", + " \"type\": \"message\",\n", + " \"role\": \"user\",\n", + " \"content\": {\n", + " \"type\": \"input_text\",\n", + " \"text\": \"Extract the symbols from the code file {{item.input}}\",\n", + " },\n", + " },\n", + " ],\n", + " },\n", + " \"model\": \"gpt-4.1-mini\",\n", + " \"sampling_params\": {\n", + " \"seed\": 42,\n", + " \"temperature\": 0.7,\n", + " \"max_completions_tokens\": 10000,\n", + " \"top_p\": 0.9,\n", + " \"text\": {\n", + " \"format\": {\n", + " \"type\": \"json_schema\",\n", + " \"name\": \"python_symbols\",\n", + " \"schema\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"symbols\": {\n", + " \"type\": \"array\",\n", + " \"description\": \"A list of symbols extracted from Python code.\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"name\": {\"type\": \"string\", \"description\": \"The name of the symbol.\"},\n", + " \"symbol_type\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The type of the symbol, e.g., variable, function, class.\",\n", + " },\n", + " },\n", + " \"required\": [\"name\", \"symbol_type\"],\n", + " \"additionalProperties\": False,\n", + " },\n", + " }\n", + " },\n", + " \"required\": [\"symbols\"],\n", + " \"additionalProperties\": False,\n", + " },\n", + " \"strict\": True,\n", + " },\n", + " },\n", + " },\n", + " },\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"cbc4f775", + "metadata": {}, + "outputs": [], + "source": [ + "def poll_runs(eval_id, run_ids):\n", + " while True:\n", + " runs = [client.evals.runs.retrieve(rid, eval_id=eval_id) for rid in run_ids]\n", + " for run in runs:\n", + " print(run.id, run.status, run.result_counts)\n", + " if all(run.status in {\"completed\", \"failed\"} for run in runs):\n", + " # dump results to file\n", + " for run in runs:\n", + " with open(f\"{run.id}.json\", \"w\") as f:\n", + " f.write(\n", + " client.evals.runs.output_items.list(\n", + " run_id=run.id, eval_id=eval_id\n", + " ).model_dump_json(indent=4)\n", + " )\n", + " break\n", + " time.sleep(5)\n", + "\n", + "poll_runs(logs_eval.id, [gpt_4one_completions_run.id, gpt_4one_responses_run.id])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c316e6eb", + "metadata": {}, + "outputs": [], + "source": [ + "completions_output = client.evals.runs.output_items.list(\n", + " run_id=gpt_4one_completions_run.id, eval_id=logs_eval.id\n", + ")\n", + "\n", + "responses_output = client.evals.runs.output_items.list(\n", + " run_id=gpt_4one_responses_run.id, eval_id=logs_eval.id\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f1b502e", + "metadata": {}, + "outputs": [], + "source": [ + "print('# Completions Output')\n", + "for item in completions_output:\n", + " print(item)\n", + "\n", + "print('\\n# Responses Output')\n", + "for item in responses_output:\n", + " print(item)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "openai", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/evaluation/use-cases/tools-evaluation.ipynb 
b/examples/evaluation/use-cases/tools-evaluation.ipynb new file mode 100644 index 0000000000..cd5c72b52e --- /dev/null +++ b/examples/evaluation/use-cases/tools-evaluation.ipynb @@ -0,0 +1,268 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluating Code Symbol Extraction Quality with a Custom Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook demonstrates how to evaluate a model's ability to extract symbols from code files through function (tool) calls using the OpenAI **Evals** framework with a custom in-memory dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evalrun_68437e5370c481919a6874594ca177d9 queued ResultCounts(errored=0, failed=0, passed=0, total=0)\n", + "evalrun_68437e544fe881918f76dbd8dce3fd15 queued ResultCounts(errored=0, failed=0, passed=0, total=0)\n", + "evalrun_68437e5370c481919a6874594ca177d9 in_progress ResultCounts(errored=0, failed=0, passed=0, total=0)\n", + "evalrun_68437e544fe881918f76dbd8dce3fd15 in_progress ResultCounts(errored=0, failed=0, passed=0, total=0)\n", + "evalrun_68437e5370c481919a6874594ca177d9 in_progress ResultCounts(errored=0, failed=0, passed=0, total=0)\n", + "evalrun_68437e544fe881918f76dbd8dce3fd15 in_progress ResultCounts(errored=0, failed=0, passed=0, total=0)\n", + "evalrun_68437e5370c481919a6874594ca177d9 in_progress ResultCounts(errored=0, failed=0, passed=0, total=0)\n", + "evalrun_68437e544fe881918f76dbd8dce3fd15 completed ResultCounts(errored=0, failed=0, passed=1, total=1)\n", + "evalrun_68437e5370c481919a6874594ca177d9 in_progress ResultCounts(errored=0, failed=0, passed=0, total=0)\n", + "evalrun_68437e544fe881918f76dbd8dce3fd15 completed ResultCounts(errored=0, failed=0, passed=1, total=1)\n", + "evalrun_68437e5370c481919a6874594ca177d9 completed ResultCounts(errored=0, failed=1, passed=0, total=1)\n", + 
"evalrun_68437e544fe881918f76dbd8dce3fd15 completed ResultCounts(errored=0, failed=0, passed=1, total=1)\n" + ] + } + ], + "source": [ + "import os\n", + "import time\n", + "\n", + "import openai\n", + "\n", + "client = openai.OpenAI(\n", + " api_key=os.getenv(\"OPENAI_API_KEY\") or os.getenv(\"_OPENAI_API_KEY\"),\n", + ")\n", + "\n", + "\n", + "def get_dataset(limit=None):\n", + " openai_sdk_file_path = os.path.dirname(openai.__file__)\n", + "\n", + " file_paths = [\n", + " os.path.join(openai_sdk_file_path, \"resources\", \"evals\", \"evals.py\"),\n", + " os.path.join(openai_sdk_file_path, \"resources\", \"responses\", \"responses.py\"),\n", + " os.path.join(openai_sdk_file_path, \"resources\", \"images.py\"),\n", + " os.path.join(openai_sdk_file_path, \"resources\", \"embeddings.py\"),\n", + " os.path.join(openai_sdk_file_path, \"resources\", \"files.py\"),\n", + " ]\n", + "\n", + " items = []\n", + " for file_path in file_paths:\n", + " items.append({\"input\": open(file_path, \"r\").read()})\n", + " if limit:\n", + " return items[:limit]\n", + " return items\n", + "\n", + "\n", + "structured_output_grader = \"\"\"\n", + "You are a helpful assistant that grades the quality of extracted information from a code file.\n", + "You will be given a code file and a list of extracted information.\n", + "You should grade the quality of the extracted information.\n", + "\n", + "You should grade the quality on a scale of 1 to 7.\n", + "You should apply the following criteria, and calculate your score as follows:\n", + "You should first check for completeness on a scale of 1 to 7.\n", + "Then you should apply a quality modifier.\n", + "\n", + "The quality modifier is a multiplier from 0 to 1 that you multiply by the completeness score.\n", + "If there is 100% coverage for completion and it is all high quality, then you would return 7*1.\n", + "If there is 100% coverage for completion but it is all low quality, then you would return 7*0.5.\n", + "etc.\n", + "\"\"\"\n", + 
"\n", + "structured_output_grader_user_prompt = \"\"\"\n", + "\n", + "{{item.input}}\n", + "\n", + "\n", + "\n", + "{{sample.output_tools[0].function.arguments.symbols}}\n", + "\n", + "\"\"\"\n", + "\n", + "logs_eval = client.evals.create(\n", + " name=\"Code QA Eval\",\n", + " data_source_config={\n", + " \"type\": \"custom\",\n", + " \"item_schema\": {\"type\": \"object\", \"properties\": {\"input\": {\"type\": \"string\"}}},\n", + " \"include_sample_schema\": True,\n", + " },\n", + " testing_criteria=[\n", + " {\n", + " \"type\": \"score_model\",\n", + " \"name\": \"General Evaluator\",\n", + " \"model\": \"o3\",\n", + " \"input\": [\n", + " {\"role\": \"system\", \"content\": structured_output_grader},\n", + " {\"role\": \"user\", \"content\": structured_output_grader_user_prompt},\n", + " ],\n", + " \"range\": [1, 7],\n", + " \"pass_threshold\": 5.5,\n", + " }\n", + " ],\n", + ")\n", + "\n", + "symbol_tool = {\n", + " \"name\": \"extract_symbols\",\n", + " \"description\": \"Extract the symbols from the code file\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"symbols\": {\n", + " \"type\": \"array\",\n", + " \"description\": \"A list of symbols extracted from Python code.\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"name\": {\"type\": \"string\", \"description\": \"The name of the symbol.\"},\n", + " \"symbol_type\": {\"type\": \"string\", \"description\": \"The type of the symbol, e.g., variable, function, class.\"},\n", + " },\n", + " \"required\": [\"name\", \"symbol_type\"],\n", + " \"additionalProperties\": False,\n", + " },\n", + " }\n", + " },\n", + " \"required\": [\"symbols\"],\n", + " \"additionalProperties\": False,\n", + " },\n", + "}\n", + "\n", + "gpt_4one_completions_run = client.evals.runs.create(\n", + " name=\"gpt-4.1\",\n", + " eval_id=logs_eval.id,\n", + " data_source={\n", + " \"type\": \"completions\",\n", + " \"source\": {\"type\": 
\"file_content\", \"content\": [{\"item\": item} for item in get_dataset(limit=1)]},\n", + " \"input_messages\": {\n", + " \"type\": \"template\",\n", + " \"template\": [\n", + " {\"type\": \"message\", \"role\": \"system\", \"content\": {\"type\": \"input_text\", \"text\": \"You are a helpful assistant.\"}},\n", + " {\"type\": \"message\", \"role\": \"user\", \"content\": {\"type\": \"input_text\", \"text\": \"Extract the symbols from the code file {{item.input}}\"}},\n", + " ],\n", + " },\n", + " \"model\": \"gpt-4.1\",\n", + " \"sampling_params\": {\n", + " \"seed\": 42,\n", + " \"temperature\": 0.7,\n", + " \"max_completions_tokens\": 10000,\n", + " \"top_p\": 0.9,\n", + " \"tools\": [{\"type\": \"function\", \"function\": symbol_tool}],\n", + " },\n", + " },\n", + ")\n", + "\n", + "gpt_4one_responses_run = client.evals.runs.create(\n", + " name=\"gpt-4.1\",\n", + " eval_id=logs_eval.id,\n", + " data_source={\n", + " \"type\": \"responses\",\n", + " \"source\": {\"type\": \"file_content\", \"content\": [{\"item\": item} for item in get_dataset(limit=1)]},\n", + " \"input_messages\": {\n", + " \"type\": \"template\",\n", + " \"template\": [\n", + " {\"type\": \"message\", \"role\": \"system\", \"content\": {\"type\": \"input_text\", \"text\": \"You are a helpful assistant.\"}},\n", + " {\"type\": \"message\", \"role\": \"user\", \"content\": {\"type\": \"input_text\", \"text\": \"Extract the symbols from the code file {{item.input}}\"}},\n", + " ],\n", + " },\n", + " \"model\": \"gpt-4.1\",\n", + " \"sampling_params\": {\n", + " \"seed\": 42,\n", + " \"temperature\": 0.7,\n", + " \"max_completions_tokens\": 10000,\n", + " \"top_p\": 0.9,\n", + " \"tools\": [{\"type\": \"function\", **symbol_tool}],\n", + " },\n", + " },\n", + ")\n", + "\n", + "\n", + "def poll_runs(eval_id, run_ids):\n", + " # poll both runs at the same time, until they are complete or failed\n", + " while True:\n", + " runs = [client.evals.runs.retrieve(run_id, eval_id=eval_id) for run_id in 
run_ids]\n", + " for run in runs:\n", + " print(run.id, run.status, run.result_counts)\n", + " if all(run.status in (\"completed\", \"failed\") for run in runs):\n", + " break\n", + " time.sleep(5)\n", + "\n", + "\n", + "poll_runs(logs_eval.id, [gpt_4one_completions_run.id, gpt_4one_responses_run.id])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "completions_output = client.evals.runs.output_items.list(\n", + " run_id=gpt_4one_completions_run.id, eval_id=logs_eval.id\n", + ")\n", + "\n", + "responses_output = client.evals.runs.output_items.list(\n", + " run_id=gpt_4one_responses_run.id, eval_id=logs_eval.id\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'name': 'Evals', 'symbol_type': 'class'}, {'name': 'AsyncEvals', 'symbol_type': 'class'}, {'name': 'EvalsWithRawResponse', 'symbol_type': 'class'}, {'name': 'AsyncEvalsWithRawResponse', 'symbol_type': 'class'}, {'name': 'EvalsWithStreamingResponse', 'symbol_type': 'class'}, {'name': 'AsyncEvalsWithStreamingResponse', 'symbol_type': 'class'}, {'name': '__all__', 'symbol_type': 'variable'}, {'name': 'Evals.runs', 'symbol_type': 'function'}, {'name': 'Evals.with_raw_response', 'symbol_type': 'function'}, {'name': 'Evals.with_streaming_response', 'symbol_type': 'function'}, {'name': 'Evals.create', 'symbol_type': 'function'}, {'name': 'Evals.retrieve', 'symbol_type': 'function'}, {'name': 'Evals.update', 'symbol_type': 'function'}, {'name': 'Evals.list', 'symbol_type': 'function'}, {'name': 'Evals.delete', 'symbol_type': 'function'}, {'name': 'AsyncEvals.runs', 'symbol_type': 'function'}, {'name': 'AsyncEvals.with_raw_response', 'symbol_type': 'function'}, {'name': 'AsyncEvals.with_streaming_response', 'symbol_type': 'function'}, {'name': 'AsyncEvals.create', 'symbol_type': 'function'}, {'name': 'AsyncEvals.retrieve', 
'symbol_type': 'function'}, {'name': 'AsyncEvals.update', 'symbol_type': 'function'}, {'name': 'AsyncEvals.list', 'symbol_type': 'function'}, {'name': 'AsyncEvals.delete', 'symbol_type': 'function'}, {'name': 'EvalsWithRawResponse.__init__', 'symbol_type': 'function'}, {'name': 'EvalsWithRawResponse.runs', 'symbol_type': 'function'}, {'name': 'AsyncEvalsWithRawResponse.__init__', 'symbol_type': 'function'}, {'name': 'AsyncEvalsWithRawResponse.runs', 'symbol_type': 'function'}, {'name': 'EvalsWithStreamingResponse.__init__', 'symbol_type': 'function'}, {'name': 'EvalsWithStreamingResponse.runs', 'symbol_type': 'function'}, {'name': 'AsyncEvalsWithStreamingResponse.__init__', 'symbol_type': 'function'}, {'name': 'AsyncEvalsWithStreamingResponse.runs', 'symbol_type': 'function'}]\n", + "[{'name': 'Evals', 'symbol_type': 'class'}, {'name': 'AsyncEvals', 'symbol_type': 'class'}, {'name': 'EvalsWithRawResponse', 'symbol_type': 'class'}, {'name': 'AsyncEvalsWithRawResponse', 'symbol_type': 'class'}, {'name': 'EvalsWithStreamingResponse', 'symbol_type': 'class'}, {'name': 'AsyncEvalsWithStreamingResponse', 'symbol_type': 'class'}, {'name': '__all__', 'symbol_type': 'variable'}]\n" + ] + } + ], + "source": [ + "import json\n", + "\n", + "for item in completions_output:\n", + " print(json.loads(item.sample.output[0].tool_calls[0][\"function\"][\"arguments\"])[\"symbols\"])\n", + "\n", + "for item in responses_output:\n", + " print(json.loads(item.sample.output[0].tool_calls[0][\"function\"][\"arguments\"])[\"symbols\"])\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "openai", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git 
a/examples/evaluation/use-cases/web-search-evaluation.ipynb b/examples/evaluation/use-cases/web-search-evaluation.ipynb new file mode 100644 index 0000000000..91f9dbb5f3 --- /dev/null +++ b/examples/evaluation/use-cases/web-search-evaluation.ipynb @@ -0,0 +1,197 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluating Web Search Quality with a Custom Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook demonstrates how to evaluate a model's ability to retrieve correct answers from the web using the OpenAI **Evals** framework with a custom in-memory dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import time\n", + "\n", + "import openai\n", + "\n", + "client = openai.OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\") or os.getenv(\"_OPENAI_API_KEY\"))\n", + "\n", + "\n", + "def get_dataset(limit=None):\n", + " return [\n", + " {\n", + " \"query\": \"coolest person in the world, the 100m dash at the 2008 olympics was the best sports event of all time\",\n", + " \"answer\": \"usain bolt\",\n", + " },\n", + " {\n", + " \"query\": \"best library in the world, there is nothing better than a dataframe\",\n", + " \"answer\": \"pandas\",\n", + " },\n", + " {\n", + " \"query\": \"most fun place to visit, I am obsessed with the Philbrook Museum of Art\",\n", + " \"answer\": \"tulsa, oklahoma\",\n", + " },\n", + " ]\n", + "\n", + "\n", + "pass_fail_grader = \"\"\"\n", + "You are a helpful assistant that grades the quality of a web search.\n", + "You will be given a query and an answer.\n", + "You should grade the quality of the web search.\n", + "\n", + "You should either say \"pass\" or \"fail\", if the query contains the answer.\n", + "\n", + "\"\"\"\n", + "\n", + "pass_fail_grader_user_prompt = \"\"\"\n", + "\n", + "{{item.query}}\n", + "\n", + "\n", + "\n", + "{{sample.output_text}}\n", + "\n", + "\n", 
+ "\n", + "{{item.answer}}\n", + "\n", + "\"\"\"\n", + "\n", + "logs_eval = client.evals.create(\n", + " name=\"Web Search Eval\",\n", + " data_source_config={\n", + " \"type\": \"custom\",\n", + " \"item_schema\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"query\": {\"type\": \"string\"},\n", + " \"answer\": {\"type\": \"string\"},\n", + " },\n", + " },\n", + " \"include_sample_schema\": True,\n", + " },\n", + " testing_criteria=[\n", + " {\n", + " \"type\": \"label_model\",\n", + " \"name\": \"Web Search Evaluator\",\n", + " \"model\": \"o3\",\n", + " \"input\": [\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": pass_fail_grader,\n", + " },\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": pass_fail_grader_user_prompt,\n", + " },\n", + " ],\n", + " \"passing_labels\": [\"pass\"],\n", + " \"labels\": [\"pass\", \"fail\"],\n", + " }\n", + " ],\n", + ")\n", + "\n", + "gpt_4one_responses_run = client.evals.runs.create(\n", + " name=\"gpt-4.1\",\n", + " eval_id=logs_eval.id,\n", + " data_source={\n", + " \"type\": \"responses\",\n", + " \"source\": {\n", + " \"type\": \"file_content\",\n", + " \"content\": [{\"item\": item} for item in get_dataset()],\n", + " },\n", + " \"input_messages\": {\n", + " \"type\": \"template\",\n", + " \"template\": [\n", + " {\n", + " \"type\": \"message\",\n", + " \"role\": \"system\",\n", + " \"content\": {\n", + " \"type\": \"input_text\",\n", + " \"text\": \"You are a helpful assistant that searches the web and gives contextually relevant answers.\",\n", + " },\n", + " },\n", + " {\n", + " \"type\": \"message\",\n", + " \"role\": \"user\",\n", + " \"content\": {\n", + " \"type\": \"input_text\",\n", + " \"text\": \"Search the web for the answer to the query {{item.query}}\",\n", + " },\n", + " },\n", + " ],\n", + " },\n", + " \"model\": \"gpt-4.1\",\n", + " \"sampling_params\": {\n", + " \"seed\": 42,\n", + " \"temperature\": 0.7,\n", + " \"max_completion_tokens\": 10000,\n", + " 
\"top_p\": 0.9,\n", + " \"tools\": [{\"type\": \"web_search_preview\"}],\n", + " },\n", + " },\n", + ")\n", + "\n", + "\n", + "def poll_runs(eval_id, run_ids):\n", + " # poll both runs at the same time, until they are complete or failed\n", + " while True:\n", + " runs = [client.evals.runs.retrieve(run_id, eval_id=eval_id) for run_id in run_ids]\n", + " for run in runs:\n", + " print(run.id, run.status, run.result_counts)\n", + " if all(run.status == \"completed\" or run.status == \"failed\" for run in runs):\n", + " break\n", + " time.sleep(5)\n", + "\n", + "\n", + "poll_runs(logs_eval.id, [gpt_4one_responses_run.id])\n", + "\n", + "four_one = client.evals.runs.output_items.list(\n", + " run_id=gpt_4one_responses_run.id, eval_id=logs_eval.id\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for item in four_one:\n", + " print(item.sample.output[0].content)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "openai", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/registry.yaml b/registry.yaml index d84d06caf0..f1160e9391 100644 --- a/registry.yaml +++ b/registry.yaml @@ -2129,3 +2129,44 @@ - evals - stripe - conversion + +- title: Evals API Use-case - MCP Evaluation + path: examples/evaluation/use-cases/mcp_eval_notebook.ipynb + date: 2025-06-09 + authors: + - josiah-openai + tags: + - evals-api + - responses + - evals + - mcp + +- title: Evals API Use-case - Structured Outputs Evaluation + path: examples/evaluation/use-cases/structured-outputs-evaluation.ipynb + date: 2025-06-09 + authors: + - josiah-openai + tags: + - evals-api + - responses + - evals + +- title: 
Evals API Use-case - Tools Evaluation + path: examples/evaluation/use-cases/tools-evaluation.ipynb + date: 2025-06-09 + authors: + - josiah-openai + tags: + - evals-api + - responses + - evals + +- title: Evals API Use-case - Web Search Evaluation + path: examples/evaluation/use-cases/web-search-evaluation.ipynb + date: 2025-06-09 + authors: + - josiah-openai + tags: + - evals-api + - responses + - evals