diff --git a/examples/partners/model_selection_guide/agent_utils.py b/examples/partners/model_selection_guide/agent_utils.py new file mode 100644 index 0000000000..9b06baf4e3 --- /dev/null +++ b/examples/partners/model_selection_guide/agent_utils.py @@ -0,0 +1,225 @@ +from __future__ import annotations +import json, time, uuid, logging, re +from dataclasses import dataclass, asdict, field +from pathlib import Path +from typing import Any, Dict, List +from openai import OpenAI + +# --- tool back‑ends ------------------------- +from tools import chem_lookup, cost_estimator, outcome_db, literature_search, list_available_chemicals + +# ---------- tiny infrastructure helpers -------------------------------------- + +# Holds run-specific parameters provided by user. +@dataclass +class Context: + compound: str + goal: str + budget: float + time_h: int + previous: str + client: OpenAI + run_id: str = field(default_factory=lambda: uuid.uuid4().hex[:8]) + + def prompt_vars(self): + return { + "compound": self.compound, + "goal": self.goal, + "budget": self.budget, + "time_h": self.time_h, + "previous": self.previous, + } + +# -- Function‑calling tool manifest -------------------- + +def load_tools(): + return [ + { + "type": "function", + "function": { + "name": "chem_lookup", + "description": "Mock function to look up chemical properties.", + "parameters": { + "type": "object", + "properties": { + "chemical_name": { + "type": "string", + "description": "The name of the chemical to look up." + }, + "property": { + "type": "string", + "description": "Optional specific property to retrieve (e.g., 'melting_point'). If None, returns all properties." + } + }, + "required": ["chemical_name"] + } + } + }, + { + "type": "function", + "function": { + "name": "cost_estimator", + "description": "Mock function to estimate the cost of reagents and procedures.", + "parameters": { + "type": "object", + "properties": { + "reagents": { + "type": "array", + "description": "List of reagents, where each reagent is a dictionary with 'name', 'amount', and 'unit'.", + "items": { + "type": "object", + "properties": { + "name": {"type": "string", "description": "Name of the reagent."}, + "amount": {"type": "number", "description": "Amount of the reagent."}, + "unit": {"type": "string", "description": "Unit for the amount (e.g., 'g', 'mg', 'kg')."} + }, + "required": ["name", "amount", "unit"] + } + }, + "equipment": { + "type": "array", + "description": "Optional list of equipment items used.", + "items": {"type": "string"} + }, + "duration_hours": { + "type": "number", + "description": "Optional duration of the procedure in hours for labor cost calculation." + } + }, + } + } + }, + { + "type": "function", + "function": { + "name": "outcome_db", + "description": "Mock function to query the database of past experiment outcomes.", + "parameters": { + "type": "object", + "properties": { + "compound": { + "type": "string", + "description": "The chemical compound name to query past experiments for." + }, + "parameter": { + "type": "string", + "description": "Optional specific parameter to filter experiments by (e.g., 'yield', 'temperature')." + }, + "limit": { + "type": "integer", + "description": "Maximum number of experiment results to return (default: 5)." 
+ } + }, + "required": ["compound"] + } + } + }, + { + "type": "function", + "function": { + "name": "literature_search", + "description": "Mock function to search scientific literature for relevant information.", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The search query (keywords) for the literature search." + }, + "filter": { + "type": "string", + "description": "Optional filter string, potentially including year (e.g., '2023') or journal name." + }, + "limit": { + "type": "integer", + "description": "Maximum number of search results to return (default: 3)." + } + }, + "required": ["query"] + } + } + }, + { + "type": "function", + "function": { + "name": "list_available_chemicals", + "description": "Provides a list of all chemical names available in the database.", + "parameters": { + "type": "object", + "properties": {}, + # No parameters needed for this tool + } + } + } + ] + +# -- minimal logger ----------------------------------------------------------- + +def log_json(stage: str, data: Any, ctx: Context): + Path("logs").mkdir(exist_ok=True) + p = Path("logs") / f"{ctx.run_id}.log" + with p.open("a", encoding="utf-8") as f: + f.write(json.dumps({"ts": time.time(), "stage": stage, "data": data}, indent=2) + "\n") + +# -- JSON extractor ----------------------------------------------------- + +def _parse_json(text: str) -> Dict[str, Any]: + try: + return json.loads(text) + except json.JSONDecodeError: + # try to rescue JSON from a ```json ...``` block (note: \s, not \\s, so the pattern matches whitespace) + m = re.search(r"```(?:json)?\s*(.*?)```", text, re.S) + if m: + try: + return json.loads(m.group(1)) + except json.JSONDecodeError: + pass # fall-through to raw + return {"raw": text} # give caller *something* parsable + + +# -- tool call handler -------------------------------------------------------- + +def _dispatch_tool(name: str, args: Dict[str, Any]): + """Run the local Python implementation of a tool. 
+ If the model supplied bad / missing arguments, return an error JSON instead + of raising – so the conversation can continue.""" + try: + return { + "chem_lookup": chem_lookup, + "cost_estimator": cost_estimator, + "outcome_db": outcome_db, + "literature_search": literature_search, + "list_available_chemicals": list_available_chemicals, + }[name](**args) + except TypeError as e: + # log & surface the problem back to the model in a structured way + logging.warning(f"Tool {name} failed: {e}") + return {"tool_error": str(e), "supplied_args": args} + +# -- unified OpenAI call w/ recursive tool handling --------------------------- + +def call_openai(client: OpenAI, model: str, system: str, user: str, ctx: Context): + messages = [ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ] + while True: + resp = client.chat.completions.create( + model=model, + messages=messages, + tools=load_tools(), + tool_choice="auto", + ) + msg = resp.choices[0].message + messages.append(msg.model_dump(exclude_unset=True)) + if not msg.tool_calls: + log_json(model, msg.content, ctx) + return _parse_json(msg.content) + # handle first tool call, then loop again + for tc in msg.tool_calls: + result = _dispatch_tool(tc.function.name, json.loads(tc.function.arguments)) + messages.append({ + "role": "tool", "tool_call_id": tc.id, + "content": json.dumps(result) + }) + diff --git a/examples/partners/model_selection_guide/images/2.2_model_evolution.png b/examples/partners/model_selection_guide/images/2.2_model_evolution.png new file mode 100644 index 0000000000..87f09ef350 Binary files /dev/null and b/examples/partners/model_selection_guide/images/2.2_model_evolution.png differ diff --git a/examples/partners/model_selection_guide/images/3A_rag_hierarchical_router.png b/examples/partners/model_selection_guide/images/3A_rag_hierarchical_router.png new file mode 100644 index 0000000000..0d1926b7c9 Binary files /dev/null and b/examples/partners/model_selection_guide/images/3A_rag_hierarchical_router.png differ diff --git a/examples/partners/model_selection_guide/images/3A_rag_task_card.png b/examples/partners/model_selection_guide/images/3A_rag_task_card.png new file mode 100644 index 0000000000..7761c85786 Binary files /dev/null and b/examples/partners/model_selection_guide/images/3A_rag_task_card.png differ diff --git a/examples/partners/model_selection_guide/images/3B_coscientist_architecture.png b/examples/partners/model_selection_guide/images/3B_coscientist_architecture.png new file mode 100644 index 0000000000..cea31baef5 Binary files /dev/null and b/examples/partners/model_selection_guide/images/3B_coscientist_architecture.png differ diff --git a/examples/partners/model_selection_guide/images/3B_reasoning_task_card.png b/examples/partners/model_selection_guide/images/3B_reasoning_task_card.png new file mode 100644 index 0000000000..c744242332 Binary files /dev/null and b/examples/partners/model_selection_guide/images/3B_reasoning_task_card.png differ diff --git a/examples/partners/model_selection_guide/images/3C_insurance_architecture.png b/examples/partners/model_selection_guide/images/3C_insurance_architecture.png new file mode 100644 index 0000000000..74342bd3c8 Binary files /dev/null and b/examples/partners/model_selection_guide/images/3C_insurance_architecture.png differ diff --git a/examples/partners/model_selection_guide/images/3C_insurance_form.png b/examples/partners/model_selection_guide/images/3C_insurance_form.png new file mode 100644 index 0000000000..97dae9e95f Binary files 
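The `agent_utils.py` module above ships without a usage example. The following is a minimal sketch (not part of the diff) of how `Context` and `call_openai` are intended to be wired together; the compound, goal, budget, and prompt strings are illustrative assumptions rather than values from the repository.

```python
# Hypothetical usage of agent_utils.call_openai; all concrete values are assumptions.
from openai import OpenAI
from agent_utils import Context, call_openai

client = OpenAI()

ctx = Context(
    compound="aspirin",                 # placeholder run parameters
    goal="improve synthesis yield",
    budget=5000.0,
    time_h=48,
    previous="none",
    client=client,
)

system = "You are a chemistry co-scientist. Reply with a JSON experiment plan."
user = f"Design an experiment for {ctx.compound} with goal: {ctx.goal}."

# call_openai loops over any tool calls (chem_lookup, cost_estimator, ...) until the
# model returns a final message, then parses it as JSON (or falls back to {"raw": ...}).
plan = call_openai(client, model="o4-mini", system=system, user=user, ctx=ctx)
print(plan)
```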
/dev/null and b/examples/partners/model_selection_guide/images/3C_insurance_form.png differ diff --git a/examples/partners/model_selection_guide/images/3C_insurance_task_card.png b/examples/partners/model_selection_guide/images/3C_insurance_task_card.png new file mode 100644 index 0000000000..65a8157bf0 Binary files /dev/null and b/examples/partners/model_selection_guide/images/3C_insurance_task_card.png differ diff --git a/examples/partners/model_selection_guide/images/3D_model_selection_flowchart.png b/examples/partners/model_selection_guide/images/3D_model_selection_flowchart.png new file mode 100644 index 0000000000..59e2d6a72a Binary files /dev/null and b/examples/partners/model_selection_guide/images/3D_model_selection_flowchart.png differ diff --git a/examples/partners/model_selection_guide/model_selection_guide.ipynb b/examples/partners/model_selection_guide/model_selection_guide.ipynb new file mode 100644 index 0000000000..859f1e5fa4 --- /dev/null +++ b/examples/partners/model_selection_guide/model_selection_guide.ipynb @@ -0,0 +1,3317 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a5e91602", + "metadata": {}, + "source": [ + "# Practical Guide for Model Selection for Real‑World Use Cases\n", + "\n", + "## Purpose & Audience\n", + "\n", + "This cookbook serves as your practical guide to selecting, prompting, and deploying the right OpenAI model (GPT-4.1, o3, or o4-mini) for specific workloads. Instead of exhaustive documentation, we provide actionable decision frameworks and real-world examples that help Solutions Engineers, Technical Account Managers, Partner Architects, and semi-technical practitioners quickly build working solutions. The content focuses on current model capabilities, vertical-specific implementations, and today's industry needs, with clear pathways from model selection to production deployment. Each section offers concise, adaptable code examples that you can immediately apply to your use cases while pointing to existing resources for deeper dives into specific topics.\n", + "\n", + "> Note: The prescriptive guidance and experiments below were conducted with the latest SOTA models available today. These recommendations are bound to change as models, scenarios, and timelines evolve.\n", + "\n", + "## How to Use This Cookbook\n", + "\n", + "This cookbook is organized into distinct sections to help you quickly find the information you need. Each section covers a specific aspect of model selection, implementation, and deployment.\n", + "\n", + "1. **[Purpose & Audience](#purpose-audience)**: An overview of who this cookbook is for and what it covers.\n", + "2. **[Model Guide](#model-guide)**: A quick reference to help you select the right model for your needs, including model comparisons and evolution diagrams mapped to different use-case scenarios.\n", + "3. **Use Cases**:\n", + " - **[3A. Long-Context RAG for Legal Q&A](#use-case-long-context-rag-for-legal-qa)**: Building an agentic system to answer questions from complex legal documents.\n", + " - **[3B. AI Co-Scientist for Pharma R&D](#use-case-ai-co-scientist-for-pharma-rd)**: Accelerating experimental design in pharmaceutical research with multi-agent systems.\n", + " - **[3C. Insurance Claim Processing](#use-case-insurance-claim-processing)**: Digitizing and validating handwritten insurance forms with vision and reasoning.\n", + "4. 
**[Prototype to Production](#prototype-to-production)**: A checklist to help you transition from prototype to production.\n", + "5. **[Adaptation Decision Tree](#adaptation-decision-tree)**: A flowchart to guide your model selection based on specific requirements.\n", + "6. **[Appendices](#appendices)**: Reference materials including pricing, latency, prompt patterns, and links to external resources.\n", + "\n", + "For quick decisions, focus on the Model Guide and Adaptation Decision Tree sections. For implementation details, explore the specific use cases relevant to your needs.\n", + "\n", + "\n", + "================================================================================\n", + "\n", + "\n", + "\n", + "## Model Guide\n", + "\n", + "## 2.1 Model‑Intro Matrix\n", + "\n", + "| Model | Core strength | Ideal first reach‑for | Watch‑outs | Escalate / Downgrade path |\n", + "| :---- | :---- | :---- | :---- | :---- |\n", + "| GPT‑4o | Real‑time voice / vision chat | Live multimodal agents | Slightly below 4.1 on text SOTA (state-of-the-art) | Need deep reasoning → o4‑mini |\n", + "| GPT‑4.1 | 1 M‑token text accuracy king | Long‑doc analytics, code review | Cannot natively reason; higher cost than minis | Tight budget → 4.1‑mini / nano |\n", + "| o3 | Deep tool‑using agent | High‑stakes, multi‑step reasoning | Latency & price | Cost/latency → o4‑mini |\n", + "| o4‑mini | Cheap, fast reasoning | High‑volume \"good‑enough\" logic | Depth ceiling vs o3 | Accuracy critical → o3 |\n", + "\n", + "# *(Full price and utility table → [Section 6.1](#appendices))*\n", + "\n", + "## 2.2 Model Evolution at a Glance\n", + "\n", + "OpenAI's model lineup has evolved to address specialized needs across different dimensions. These diagrams showcase the current model families and their relationships.\n", + "\n", + "### Fundamental Differences: \"o-series\" vs \"GPT\" Models\n", + "\n", + "OpenAI offers two distinct model families, each with unique strengths:\n", + "\n", + "- **GPT Models (4o, 4.1)**: Optimized for general-purpose tasks with excellent instruction following. GPT-4.1 excels with long contexts (1M tokens) while GPT-4o has variants for realtime speech, text-to-speech, and speech-to-text. GPT-4.1 also comes in a mini, and nano variant, while GPT-4o has a mini variant. These variants are cheaper and faster than their full-size counterparts.\n", + "\n", + "- **o-series Models (o3, o4-mini)**: Specialized for deep reasoning and step-by-step problem solving. These models excel at complex, multi-stage tasks requiring logical thinking and tool use. Choose these when accuracy and reasoning depth are paramount. These models also have an optional `reasoning_effort` parameter (that can be set to `low`, `medium`, or `high`), which allows users to control the amount of tokens used for reasoning.\n", + "\n", + "### OpenAI Model Evolution \n", + "\n", + "![OpenAI Model Evolution](./images/2.2_model_evolution.png)\n", + "\n", + "### Key Characteristics\n", + "\n", + "- **GPT-4.1 Family**: Optimized for long context processing with 1M token context window.\n", + "- **o3**: Specialized for deep multi-step reasoning. \n", + "- **o4-mini**: Combines reasoning capabilities with vision at lower cost.\n", + "\n", + "Each model excels in different scenarios, with complementary strengths that can be combined for complex workflows.\n", + "\n", + "In this cookbook we only experimented with the GPT-4.1 series models, o3, and o4-mini. 
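To make the GPT-vs-o-series distinction concrete, here is a minimal sketch (not part of the original notebook) showing the reasoning-effort control on an o-series model next to a plain GPT-4.1 call via the Responses API; the prompt text and effort value are placeholder assumptions.

```python
# Minimal sketch: the same request sent to an o-series model with an explicit
# reasoning effort, and to GPT-4.1 for instruction-following / long-context work.
from openai import OpenAI

client = OpenAI()
question = "Summarize the key risks in this 200-page contract."  # placeholder prompt

# o-series: deliberate step-by-step reasoning, tunable via the reasoning effort setting
deep = client.responses.create(
    model="o4-mini",
    reasoning={"effort": "high"},   # "low" | "medium" | "high"
    input=question,
)

# GPT series: no reasoning-effort knob; rely on prompting and context length instead
fast = client.responses.create(
    model="gpt-4.1",
    input=question,
)

print(deep.output_text)
print(fast.output_text)
```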
We didn't experiment with the GPT-4o series models.\n", + "\n", + "================================================================================\n", + "\n", + "\n", + "\n", + "## 3A. Use Case: Long-Context RAG for Legal Q&A\n", + "\n", + "![Long-Context RAG for Legal Q&A](images/3A_rag_task_card.png)\n", + "## 🗂️ TL;DR Matrix\n", + "\n", + "This table summarizes the core technology choices and their rationale for **this specific Long-Context Agentic RAG implementation**.\n", + "\n", + "| Layer | Choice | Utility |\n", + "| :---- | :---- | :---- |\n", + "| **Chunking** | Sentence-aware Splitter | Splits document into 20 equal chunks, respecting sentence boundaries. |\n", + "| **Routing** | `gpt-4.1-mini` | Uses natural language understanding to identify relevant chunks without embedding index. |\n", + "| **Path Selection** | `select(ids=[...])` and `scratchpad(text=\"...\")` | Records reasoning while drilling down through document hierarchy. |\n", + "| **Citation** | Paragraph-level | Balances precision with cost; provides meaningful context for answers. |\n", + "| **Synthesis** | `gpt-4.1` (Structured Output) | Generates answers directly from selected paragraphs with citations. |\n", + "| **Verification** | `o4-mini` (LLM-as-Judge) | Validates factual accuracy and citation correctness. |\n", + "\n", + "*Note: Prices and model identifiers accurate as of April 2025, subject to change.*\n", + "\n", + "This section outlines the construction of a Retrieval-Augmented Generation (RAG) system designed to accurately answer questions about complex and lengthy procedural texts, using the *Trademark Trial and Appeal Board Manual of Procedure (TBMP)* as a representative case. The TBMP is an essential legal resource detailing the procedures governing trademark litigation before the USPTO's Trademark Trial and Appeal Board, and is frequently consulted by intellectual property attorneys and legal professionals. By leveraging the latest OpenAI models, the system enhances understanding and interpretability of dense legal content, enabling precise, contextually aware responses through advanced language understanding and dynamic retrieval capabilities.\n", + "\n", + "These approaches can also be applied to other use cases that require precise information retrieval from complex documentation, such as healthcare compliance manuals, financial regulatory frameworks, or technical documentation systems where accuracy, citation, and auditability are mission-critical requirements.\n", + "\n", + "## 1\\. Scenario Snapshot\n", + "\n", + "* **Corpus:** The primary document is the [Trademark Trial and Appeal Board Manual of Procedure (TBMP, 2019 version)](https://www.uspto.gov/sites/default/files/documents/tbmp-2019.pdf). This manual contains detailed procedural rules and guidelines, coming to 1194 pages total. \n", + "* **Users:** The target users are intellectual property (IP) litigation associates and paralegals who need quick, accurate answers to procedural questions based *only* on the TBMP. \n", + "* **Typical Asks:** Users pose questions requiring synthesis and citation, such as: \n", + " 1. \"What are the requirements for filing a motion to compel discovery according to the TBMP?\" \n", + " 2. \"What deadlines apply to discovery conferences as specified in the manual?\" \n", + " 3. \"Explain how the Board handles claims of attorney-client privilege during depositions according to the TBMP.\" \n", + " 4. \"Enumerate the Fed. R. Civ. P. 
11 sanctions the Board can invoke according to the TBMP.\" \n", + "\n", + "*Note: Depending on your specific deployment environment, you may need to adapt some implementation steps to match your infrastructure requirements.*\n", + "\n", + "> While OpenAI's File Search tool offers a good starting point for many use cases, this section introduces a different approach that takes advantage of million-token context windows to process large documents without any preprocessing or vector database. The agentic approach described here enables zero-latency ingestion, dynamic granularity of retrieval, and fine-grained citation traceability.\n", + "\n", + "## 2\\. Agentic RAG Flow\n", + "\n", + "Before diving into the implementation, let's understand the overall approach:\n", + "\n", + "1. **Load the entire document** into the context window\n", + "2. **Split into 20 chunks** that respect sentence boundaries\n", + "3. **Ask the model** which chunks might contain relevant information\n", + "4. **Drill down** into selected chunks by splitting them further\n", + "5. **Repeat** until we reach paragraph-level content\n", + "6. **Generate an answer** based on the selected paragraphs\n", + "7. **Verify the answer** for factual accuracy\n", + "\n", + "This hierarchical navigation approach mimics how a human might skim a document, focus on relevant chapters, then specific sections, and finally read only the most relevant paragraphs." + ] + }, + { + "cell_type": "markdown", + "id": "db9bad1b", + "metadata": {}, + "source": [ + "![Hierarchical Router](images/3A_rag_hierarchical_router.png)\n", + "\n", + "\n", + "## Agentic RAG System: Model Usage\n", + "\n", + "| Process Stage | Model Used | Purpose |\n", + "|---------------|------------|---------|\n", + "| Initial Routing | `gpt-4.1-mini` | Identifies which document chunks might contain relevant information |\n", + "| Hierarchical Navigation | `gpt-4.1-mini` | Continues drilling down to find most relevant paragraphs |\n", + "| Answer Generation | `gpt-4.1` | Creates structured response with citations from selected paragraphs |\n", + "| Answer Verification | `o4-mini` | Validates factual accuracy and proper citation usage |\n", + "\n", + "This zero-preprocessing approach leverages large context windows to navigate documents on-the-fly, mimicking how a human would skim a document to find relevant information. " + ] + }, + { + "cell_type": "markdown", + "id": "df87f0ac", + "metadata": {}, + "source": [ + "## 3\\. Implementation\n", + "\n", + "Let's implement this approach step by step.\n", + "\n", + "Start by installing the required packages." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "63c78cd6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.1.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install tiktoken pypdf nltk openai pydantic --quiet" + ] + }, + { + "cell_type": "markdown", + "id": "cd1d7d60", + "metadata": {}, + "source": [ + "### 3.1 Document Loading\n", + "\n", + "First, let's load the document and check its size. 
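The answer-verification stage listed in the model-usage table above (`o4-mini` as an LLM-as-judge) is not shown in this excerpt. As a hedged sketch, a minimal version might look like the following; the judge prompt and verdict schema are assumptions for illustration, not the notebook's actual implementation.

```python
# Hypothetical sketch of an o4-mini "LLM-as-judge" verification step.
import json
from typing import List
from openai import OpenAI

client = OpenAI()

def verify_answer(question: str, answer: str, cited_paragraphs: List[str]) -> dict:
    """Ask o4-mini whether the answer is fully supported by the cited paragraphs."""
    judge_prompt = (
        "You are a strict verifier. Given a question, an answer, and the paragraphs "
        "it cites, check every factual claim and citation.\n\n"
        f"QUESTION:\n{question}\n\nANSWER:\n{answer}\n\n"
        "CITED PARAGRAPHS:\n" + "\n\n".join(cited_paragraphs) +
        "\n\nReturn JSON with keys 'verdict' ('pass' or 'fail') and 'issues' (list of strings)."
    )
    resp = client.responses.create(model="o4-mini", input=judge_prompt)
    try:
        return json.loads(resp.output_text)
    except json.JSONDecodeError:
        # If the judge does not return clean JSON, surface its raw text for inspection.
        return {"verdict": "unparseable", "issues": [resp.output_text]}
```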
For this guide, we'll focus on sections 100-900, which cover the core procedural aspects through Review of Decision of Board. Sections 1000 and beyond (Interferences, Concurrent Use Proceedings, Ex Parte Appeals) are specialized procedures outside our current scope." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "dd5fb149", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt_tab to\n", + "[nltk_data] /Users/shikhar/nltk_data...\n", + "[nltk_data] Package punkt_tab is already up-to-date!\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading document from https://www.uspto.gov/sites/default/files/documents/tbmp-Master-June2024.pdf...\n", + "Document loaded: 1194 pages, 595197 words, 932964 tokens\n", + "\n", + "Document preview (first 500 chars):\n", + "--------------------------------------------------\n", + "TRADEMARK TRIAL AND\n", + "APPEAL BOARD MANUAL\n", + "OF PROCEDURE (TBMP)\n", + " June 2024\n", + "June 2024\n", + "United States Patent and Trademark Office\n", + "PREFACE TO THE JUNE 2024 REVISION\n", + "The June 2024 revision of the Trademark Trial and Appeal Board Manual of Procedure is an update of the\n", + "June 2023 edition. This update is moderate in nature and incorporates relevant case law issued between March\n", + "3, 2023 and March 1, 2024.\n", + "The title of the manual is abbreviated as “TBMP.” A citation to a section of the manual may be written\n", + "--------------------------------------------------\n" + ] + } + ], + "source": [ + "import requests\n", + "from io import BytesIO\n", + "from pypdf import PdfReader\n", + "import re\n", + "import tiktoken\n", + "from nltk.tokenize import sent_tokenize\n", + "import nltk\n", + "from typing import List, Dict, Any\n", + "\n", + "# Download nltk data if not already present\n", + "nltk.download('punkt_tab')\n", + "\n", + "def load_document(url: str) -> str:\n", + " \"\"\"Load a document from a URL and return its text content.\"\"\"\n", + " print(f\"Downloading document from {url}...\")\n", + " response = requests.get(url)\n", + " response.raise_for_status()\n", + " pdf_bytes = BytesIO(response.content)\n", + " pdf_reader = PdfReader(pdf_bytes)\n", + " \n", + " full_text = \"\"\n", + " \n", + "\n", + " max_page = 920 # Page cutoff before section 1000 (Interferences)\n", + " for i, page in enumerate(pdf_reader.pages):\n", + " if i >= max_page:\n", + " break\n", + " full_text += page.extract_text() + \"\\n\"\n", + " \n", + " # Count words and tokens\n", + " word_count = len(re.findall(r'\\b\\w+\\b', full_text))\n", + " \n", + " tokenizer = tiktoken.get_encoding(\"o200k_base\")\n", + " token_count = len(tokenizer.encode(full_text))\n", + " \n", + " print(f\"Document loaded: {len(pdf_reader.pages)} pages, {word_count} words, {token_count} tokens\")\n", + " return full_text\n", + "\n", + "# Load the document\n", + "tbmp_url = \"https://www.uspto.gov/sites/default/files/documents/tbmp-Master-June2024.pdf\"\n", + "document_text = load_document(tbmp_url)\n", + "\n", + "# Show the first 500 characters\n", + "print(\"\\nDocument preview (first 500 chars):\")\n", + "print(\"-\" * 50)\n", + "print(document_text[:500])\n", + "print(\"-\" * 50)" + ] + }, + { + "cell_type": "markdown", + "id": "4bf86c84", + "metadata": {}, + "source": [ + "We can see that the document is over 900k tokens long! 
While we could fit that into GPT 4.1's context length, we also want to have verifiable citations, so we're going to proceed with a recursive chunking strategy." + ] + }, + { + "cell_type": "markdown", + "id": "445cbcaa", + "metadata": {}, + "source": [ + "### 3.2 Improved 20-Chunk Splitter with Minimum Token Size\n", + "\n", + "Now, let's create an improved function to split the document into 20 chunks, ensuring each has a minimum token size and respecting sentence boundaries.\n", + "\n", + "> 20 is an empirically chosen number for this specific document/task and it might need tuning for other documents based on size and structure (The higher the number, the more fine-grained the chunks). The key principle here however is splitting sections of the document up, in order to let the language model decide relevant components. This same reasoning also applies to the `max_depth` parameter which will be introduced later on in the cookbook." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "604f869b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split document into 20 chunks\n", + "Chunk 0: 42326 tokens\n", + "Chunk 1: 42093 tokens\n", + "Chunk 2: 42107 tokens\n", + "Chunk 3: 39797 tokens\n", + "Chunk 4: 58959 tokens\n", + "Chunk 5: 48805 tokens\n", + "Chunk 6: 37243 tokens\n", + "Chunk 7: 33453 tokens\n", + "Chunk 8: 38644 tokens\n", + "Chunk 9: 49402 tokens\n", + "Chunk 10: 51568 tokens\n", + "Chunk 11: 49586 tokens\n", + "Chunk 12: 47722 tokens\n", + "Chunk 13: 48952 tokens\n", + "Chunk 14: 44994 tokens\n", + "Chunk 15: 50286 tokens\n", + "Chunk 16: 54424 tokens\n", + "Chunk 17: 62651 tokens\n", + "Chunk 18: 47430 tokens\n", + "Chunk 19: 42507 tokens\n" + ] + } + ], + "source": [ + "# Global tokenizer name to use consistently throughout the code\n", + "TOKENIZER_NAME = \"o200k_base\"\n", + "\n", + "def split_into_20_chunks(text: str, min_tokens: int = 500) -> List[Dict[str, Any]]:\n", + " \"\"\"\n", + " Split text into up to 20 chunks, respecting sentence boundaries and ensuring\n", + " each chunk has at least min_tokens (unless it's the last chunk).\n", + " \n", + " Args:\n", + " text: The text to split\n", + " min_tokens: The minimum number of tokens per chunk (default: 500)\n", + " \n", + " Returns:\n", + " A list of dictionaries where each dictionary has:\n", + " - id: The chunk ID (0-19)\n", + " - text: The chunk text content\n", + " \"\"\"\n", + " # First, split the text into sentences\n", + " sentences = sent_tokenize(text)\n", + " \n", + " # Get tokenizer for counting tokens\n", + " tokenizer = tiktoken.get_encoding(TOKENIZER_NAME)\n", + " \n", + " # Create chunks that respect sentence boundaries and minimum token count\n", + " chunks = []\n", + " current_chunk_sentences = []\n", + " current_chunk_tokens = 0\n", + " \n", + " for sentence in sentences:\n", + " # Count tokens in this sentence\n", + " sentence_tokens = len(tokenizer.encode(sentence))\n", + " \n", + " # If adding this sentence would make the chunk too large AND we already have the minimum tokens,\n", + " # finalize the current chunk and start a new one\n", + " if (current_chunk_tokens + sentence_tokens > min_tokens * 2) and current_chunk_tokens >= min_tokens:\n", + " chunk_text = \" \".join(current_chunk_sentences)\n", + " chunks.append({\n", + " \"id\": len(chunks), # Integer ID instead of string\n", + " \"text\": chunk_text\n", + " })\n", + " current_chunk_sentences = [sentence]\n", + " current_chunk_tokens = sentence_tokens\n", + " else:\n", + " # 
Add this sentence to the current chunk\n", + " current_chunk_sentences.append(sentence)\n", + " current_chunk_tokens += sentence_tokens\n", + " \n", + " # Add the last chunk if there's anything left\n", + " if current_chunk_sentences:\n", + " chunk_text = \" \".join(current_chunk_sentences)\n", + " chunks.append({\n", + " \"id\": len(chunks), # Integer ID instead of string\n", + " \"text\": chunk_text\n", + " })\n", + " \n", + " # If we have more than 20 chunks, consolidate them\n", + " if len(chunks) > 20:\n", + " # Recombine all text\n", + " all_text = \" \".join(chunk[\"text\"] for chunk in chunks)\n", + " # Re-split into exactly 20 chunks, without minimum token requirement\n", + " sentences = sent_tokenize(all_text)\n", + " sentences_per_chunk = len(sentences) // 20 + (1 if len(sentences) % 20 > 0 else 0)\n", + " \n", + " chunks = []\n", + " for i in range(0, len(sentences), sentences_per_chunk):\n", + " # Get the sentences for this chunk\n", + " chunk_sentences = sentences[i:i+sentences_per_chunk]\n", + " # Join the sentences into a single text\n", + " chunk_text = \" \".join(chunk_sentences)\n", + " # Create a chunk object with ID and text\n", + " chunks.append({\n", + " \"id\": len(chunks), # Integer ID instead of string\n", + " \"text\": chunk_text\n", + " })\n", + " \n", + " # Print chunk statistics\n", + " print(f\"Split document into {len(chunks)} chunks\")\n", + " for i, chunk in enumerate(chunks):\n", + " token_count = len(tokenizer.encode(chunk[\"text\"]))\n", + " print(f\"Chunk {i}: {token_count} tokens\")\n", + " \n", + " return chunks\n", + "\n", + "# Split the document into 20 chunks with minimum token size\n", + "document_chunks = split_into_20_chunks(document_text, min_tokens=500)" + ] + }, + { + "cell_type": "markdown", + "id": "dccc89e6", + "metadata": {}, + "source": [ + "### 3.3 Router Function with Improved Tool Schema\n", + "\n", + "Now, let's create the router function that will select relevant chunks and maintain a scratchpad.\n", + "\n", + "> Maintaining a scratchpad allows the model to track decision criteria and reasoning over time. This implementation uses a two-pass approach with GPT-4.1-mini: first requiring the model to update the scratchpad via a tool call (tool_choice=\"required\"), then requesting structured JSON output for chunk selection. This approach provides better visibility into the model's reasoning process while ensuring consistent structured outputs for downstream processing." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a8373af1", + "metadata": {}, + "outputs": [], + "source": [ + "from openai import OpenAI\n", + "import json\n", + "from typing import List, Dict, Any\n", + "\n", + "# Initialize OpenAI client\n", + "client = OpenAI()\n", + "\n", + "def route_chunks(question: str, chunks: List[Dict[str, Any]], \n", + " depth: int, scratchpad: str = \"\") -> Dict[str, Any]:\n", + " \"\"\"\n", + " Ask the model which chunks contain information relevant to the question.\n", + " Maintains a scratchpad for the model's reasoning.\n", + " Uses structured output for chunk selection and required tool calls for scratchpad.\n", + " \n", + " Args:\n", + " question: The user's question\n", + " chunks: List of chunks to evaluate\n", + " depth: Current depth in the navigation hierarchy\n", + " scratchpad: Current scratchpad content\n", + " \n", + " Returns:\n", + " Dictionary with selected IDs and updated scratchpad\n", + " \"\"\"\n", + " print(f\"\\n==== ROUTING AT DEPTH {depth} ====\")\n", + " print(f\"Evaluating {len(chunks)} chunks for relevance\")\n", + " \n", + " # Build system message\n", + " system_message = \"\"\"You are an expert document navigator. Your task is to:\n", + "1. Identify which text chunks might contain information to answer the user's question\n", + "2. Record your reasoning in a scratchpad for later reference\n", + "3. Choose chunks that are most likely relevant. Be selective, but thorough. Choose as many chunks as you need to answer the question, but avoid selecting too many.\n", + "\n", + "First think carefully about what information would help answer the question, then evaluate each chunk.\n", + "\"\"\"\n", + "\n", + " # Build user message with chunks and current scratchpad\n", + " user_message = f\"QUESTION: {question}\\n\\n\"\n", + " \n", + " if scratchpad:\n", + " user_message += f\"CURRENT SCRATCHPAD:\\n{scratchpad}\\n\\n\"\n", + " \n", + " user_message += \"TEXT CHUNKS:\\n\\n\"\n", + " \n", + " # Add each chunk to the message\n", + " for chunk in chunks:\n", + " user_message += f\"CHUNK {chunk['id']}:\\n{chunk['text']}\\n\\n\"\n", + " \n", + " # Define function schema for scratchpad tool calling\n", + " tools = [\n", + " {\n", + " \"type\": \"function\",\n", + " \"name\": \"update_scratchpad\",\n", + " \"description\": \"Record your reasoning about why certain chunks were selected\",\n", + " \"strict\": True,\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"text\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"Your reasoning about the chunk(s) selection\"\n", + " }\n", + " },\n", + " \"required\": [\"text\"],\n", + " \"additionalProperties\": False\n", + " }\n", + " }\n", + " ]\n", + " \n", + " # Define JSON schema for structured output (selected chunks)\n", + " text_format = {\n", + " \"format\": {\n", + " \"type\": \"json_schema\",\n", + " \"name\": \"selected_chunks\",\n", + " \"strict\": True,\n", + " \"schema\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"chunk_ids\": {\n", + " \"type\": \"array\",\n", + " \"items\": {\"type\": \"integer\"},\n", + " \"description\": \"IDs of the selected chunks that contain information to answer the question\"\n", + " }\n", + " },\n", + " \"required\": [\n", + " \"chunk_ids\"\n", + " ],\n", + " \"additionalProperties\": False\n", + " }\n", + " }\n", + " }\n", + " \n", + " # First pass: Call the model to update scratchpad (required tool call)\n", + " messages = [\n", + " {\"role\": \"system\", \"content\": 
system_message},\n", + " {\"role\": \"user\", \"content\": user_message + \"\\n\\nFirst, you must use the update_scratchpad function to record your reasoning.\"}\n", + " ]\n", + " \n", + " response = client.responses.create(\n", + " model=\"gpt-4.1-mini\",\n", + " input=messages,\n", + " tools=tools,\n", + " tool_choice=\"required\"\n", + " )\n", + " \n", + " # Process the scratchpad tool call\n", + " new_scratchpad = scratchpad\n", + " \n", + " for tool_call in response.output:\n", + " if tool_call.type == \"function_call\" and tool_call.name == \"update_scratchpad\":\n", + " args = json.loads(tool_call.arguments)\n", + " scratchpad_entry = f\"DEPTH {depth} REASONING:\\n{args.get('text', '')}\"\n", + " if new_scratchpad:\n", + " new_scratchpad += \"\\n\\n\" + scratchpad_entry\n", + " else:\n", + " new_scratchpad = scratchpad_entry\n", + " \n", + " # Add function call and result to messages\n", + " messages.append(tool_call)\n", + " messages.append({\n", + " \"type\": \"function_call_output\",\n", + " \"call_id\": tool_call.call_id,\n", + " \"output\": \"Scratchpad updated successfully.\"\n", + " })\n", + " \n", + " # Second pass: Get structured output for chunk selection\n", + " messages.append({\"role\": \"user\", \"content\": \"Now, select the chunks that could contain information to answer the question. Return a JSON object with the list of chunk IDs.\"})\n", + " \n", + " response_chunks = client.responses.create(\n", + " model=\"gpt-4.1-mini\",\n", + " input=messages,\n", + " text=text_format\n", + " )\n", + " \n", + " # Extract selected chunk IDs from structured output\n", + " selected_ids = []\n", + " if response_chunks.output_text:\n", + " try:\n", + " # The output_text should already be in JSON format due to the schema\n", + " chunk_data = json.loads(response_chunks.output_text)\n", + " selected_ids = chunk_data.get(\"chunk_ids\", [])\n", + " except json.JSONDecodeError:\n", + " print(\"Warning: Could not parse structured output as JSON\")\n", + " \n", + " # Display results\n", + " print(f\"Selected chunks: {', '.join(str(id) for id in selected_ids)}\")\n", + " print(f\"Updated scratchpad:\\n{new_scratchpad}\")\n", + " \n", + " return {\n", + " \"selected_ids\": selected_ids,\n", + " \"scratchpad\": new_scratchpad\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "c11654a9", + "metadata": {}, + "source": [ + "### 3.4 Recursive Navigation Function\n", + "\n", + "Now, let's create the recursive navigation function that drills down through the document. 
`max_depth` is the maximum number of levels to drill down (keeping token minimums in mind):" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "876940b7", + "metadata": {}, + "outputs": [], + "source": [ + "def navigate_to_paragraphs(document_text: str, question: str, max_depth: int = 1) -> Dict[str, Any]:\n", + " \"\"\"\n", + " Navigate through the document hierarchy to find relevant paragraphs.\n", + " \n", + " Args:\n", + " document_text: The full document text\n", + " question: The user's question\n", + " max_depth: Maximum depth to navigate before returning paragraphs (default: 1)\n", + " \n", + " Returns:\n", + " Dictionary with selected paragraphs and final scratchpad\n", + " \"\"\"\n", + " scratchpad = \"\"\n", + " \n", + " # Get initial chunks with min 500 tokens\n", + " chunks = split_into_20_chunks(document_text, min_tokens=500)\n", + " \n", + " # Navigator state - track chunk paths to maintain hierarchy\n", + " chunk_paths = {} # Maps numeric IDs to path strings for display\n", + " for chunk in chunks:\n", + " chunk_paths[chunk[\"id\"]] = str(chunk[\"id\"])\n", + " \n", + " # Navigate through levels until max_depth or until no chunks remain\n", + " for current_depth in range(max_depth + 1):\n", + " # Call router to get relevant chunks\n", + " result = route_chunks(question, chunks, current_depth, scratchpad)\n", + " \n", + " # Update scratchpad\n", + " scratchpad = result[\"scratchpad\"]\n", + " \n", + " # Get selected chunks\n", + " selected_ids = result[\"selected_ids\"]\n", + " selected_chunks = [c for c in chunks if c[\"id\"] in selected_ids]\n", + " \n", + " # If no chunks were selected, return empty result\n", + " if not selected_chunks:\n", + " print(\"\\nNo relevant chunks found.\")\n", + " return {\"paragraphs\": [], \"scratchpad\": scratchpad}\n", + " \n", + " # If we've reached max_depth, return the selected chunks\n", + " if current_depth == max_depth:\n", + " print(f\"\\nReturning {len(selected_chunks)} relevant chunks at depth {current_depth}\")\n", + " \n", + " # Update display IDs to show hierarchy\n", + " for chunk in selected_chunks:\n", + " chunk[\"display_id\"] = chunk_paths[chunk[\"id\"]]\n", + " \n", + " return {\"paragraphs\": selected_chunks, \"scratchpad\": scratchpad}\n", + " \n", + " # Prepare next level by splitting selected chunks further\n", + " next_level_chunks = []\n", + " next_chunk_id = 0 # Counter for new chunks\n", + " \n", + " for chunk in selected_chunks:\n", + " # Split this chunk into smaller pieces\n", + " sub_chunks = split_into_20_chunks(chunk[\"text\"], min_tokens=200)\n", + " \n", + " # Update IDs and maintain path mapping\n", + " for sub_chunk in sub_chunks:\n", + " path = f\"{chunk_paths[chunk['id']]}.{sub_chunk['id']}\"\n", + " sub_chunk[\"id\"] = next_chunk_id\n", + " chunk_paths[next_chunk_id] = path\n", + " next_level_chunks.append(sub_chunk)\n", + " next_chunk_id += 1\n", + " \n", + " # Update chunks for next iteration\n", + " chunks = next_level_chunks" + ] + }, + { + "cell_type": "markdown", + "id": "0d803dfc", + "metadata": {}, + "source": [ + "### 3.5 Run the Improved Navigation for a Sample Question\n", + "\n", + "Let's run the navigation for a sample question with our improved approach:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f6e29008", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split document into 20 chunks\n", + "Chunk 0: 42326 tokens\n", + "Chunk 1: 42093 tokens\n", + "Chunk 2: 42107 tokens\n", + "Chunk 3: 39797 
tokens\n", + "Chunk 4: 58959 tokens\n", + "Chunk 5: 48805 tokens\n", + "Chunk 6: 37243 tokens\n", + "Chunk 7: 33453 tokens\n", + "Chunk 8: 38644 tokens\n", + "Chunk 9: 49402 tokens\n", + "Chunk 10: 51568 tokens\n", + "Chunk 11: 49586 tokens\n", + "Chunk 12: 47722 tokens\n", + "Chunk 13: 48952 tokens\n", + "Chunk 14: 44994 tokens\n", + "Chunk 15: 50286 tokens\n", + "Chunk 16: 54424 tokens\n", + "Chunk 17: 62651 tokens\n", + "Chunk 18: 47430 tokens\n", + "Chunk 19: 42507 tokens\n", + "\n", + "==== ROUTING AT DEPTH 0 ====\n", + "Evaluating 20 chunks for relevance\n", + "Selected chunks: 0, 5, 7, 8, 12, 13, 14, 18, 19\n", + "Updated scratchpad:\n", + "DEPTH 0 REASONING:\n", + "The user's question is about the format for filing a motion to compel discovery and the handling of signatures.\n", + "\n", + "Chunks that discuss motions in general, format of motions, motions to compel specifically, and signature requirements are most relevant. \n", + "\n", + "From the chunks, relevant sections appear to include:\n", + "- Chunk 0: § 106.02 on Signature of Submissions, § 106.03 on form of submissions, including electronic and paper, and § 113 on Service of papers including certifications.\n", + "- Chunk 5: § 311.01(c) on Filing and Service of Answer, § 523.01 and § 523.02 on Motion to Compel Disclosure or Discovery.\n", + "- Chunk 7&8: § 801 on briefs, § 802 on oral hearing, § 503 on Motion to Dismiss for Failure, § 504 on Motion for Judgment on Pleadings, § 505 Motion for More Definite Statement, § 506 Motion to Strike.\n", + "- Chunk 12, 13,14: § 502 on Motions - In general, form of motions, briefs on motions, and motion to strike briefs; § 527 Motions for Sanctions.\n", + "- Chunk 18, 19: § 707 on Objections to Evidence including motions to strike evidence and handling of objections, waiver.\n", + "\n", + "Parts on signatures reference both paper and electronic signatures, certifications associated with signatures and the importance of including the person's name and capacity.\n", + "\n", + "The detailed rules on electronic filing and motions requiring explanations for filing in paper (technical problems/extraordinary circumstances) could be relevant to motions generally.\n", + "\n", + "Overall, to fully answer the user, we will need to address:\n", + "- The proper format and form of a discovery motion such as a motion to compel\n", + "- How signatures are handled for such motions (electronic signature, requirements on identification of signer etc)\n", + "\n", + "Thus, I select chunks 0, 5, 7, 8, 12, 13, 14, 18, 19 for thorough coverage of these points.\n", + "Split document into 20 chunks\n", + "Chunk 0: 3539 tokens\n", + "Chunk 1: 2232 tokens\n", + "Chunk 2: 1746 tokens\n", + "Chunk 3: 3078 tokens\n", + "Chunk 4: 1649 tokens\n", + "Chunk 5: 2779 tokens\n", + "Chunk 6: 2176 tokens\n", + "Chunk 7: 1667 tokens\n", + "Chunk 8: 1950 tokens\n", + "Chunk 9: 1730 tokens\n", + "Chunk 10: 1590 tokens\n", + "Chunk 11: 1964 tokens\n", + "Chunk 12: 1459 tokens\n", + "Chunk 13: 2070 tokens\n", + "Chunk 14: 2422 tokens\n", + "Chunk 15: 1976 tokens\n", + "Chunk 16: 2335 tokens\n", + "Chunk 17: 2694 tokens\n", + "Chunk 18: 2282 tokens\n", + "Chunk 19: 982 tokens\n", + "Split document into 20 chunks\n", + "Chunk 0: 1468 tokens\n", + "Chunk 1: 1946 tokens\n", + "Chunk 2: 2020 tokens\n", + "Chunk 3: 3384 tokens\n", + "Chunk 4: 2458 tokens\n", + "Chunk 5: 3535 tokens\n", + "Chunk 6: 3059 tokens\n", + "Chunk 7: 2027 tokens\n", + "Chunk 8: 2417 tokens\n", + "Chunk 9: 2772 tokens\n", + "Chunk 10: 1913 tokens\n", + "Chunk 
11: 2674 tokens\n", + "Chunk 12: 2131 tokens\n", + "Chunk 13: 1409 tokens\n", + "Chunk 14: 3256 tokens\n", + "Chunk 15: 2827 tokens\n", + "Chunk 16: 2547 tokens\n", + "Chunk 17: 4187 tokens\n", + "Chunk 18: 1527 tokens\n", + "Chunk 19: 1246 tokens\n", + "Split document into 20 chunks\n", + "Chunk 0: 2694 tokens\n", + "Chunk 1: 1808 tokens\n", + "Chunk 2: 1874 tokens\n", + "Chunk 3: 1328 tokens\n", + "Chunk 4: 1552 tokens\n", + "Chunk 5: 1436 tokens\n", + "Chunk 6: 1367 tokens\n", + "Chunk 7: 1333 tokens\n", + "Chunk 8: 978 tokens\n", + "Chunk 9: 1303 tokens\n", + "Chunk 10: 1738 tokens\n", + "Chunk 11: 1509 tokens\n", + "Chunk 12: 1875 tokens\n", + "Chunk 13: 1524 tokens\n", + "Chunk 14: 1597 tokens\n", + "Chunk 15: 1807 tokens\n", + "Chunk 16: 2449 tokens\n", + "Chunk 17: 2271 tokens\n", + "Chunk 18: 1467 tokens\n", + "Chunk 19: 1540 tokens\n", + "Split document into 20 chunks\n", + "Chunk 0: 1597 tokens\n", + "Chunk 1: 1554 tokens\n", + "Chunk 2: 1685 tokens\n", + "Chunk 3: 1416 tokens\n", + "Chunk 4: 1702 tokens\n", + "Chunk 5: 1575 tokens\n", + "Chunk 6: 1842 tokens\n", + "Chunk 7: 1981 tokens\n", + "Chunk 8: 1393 tokens\n", + "Chunk 9: 1562 tokens\n", + "Chunk 10: 1569 tokens\n", + "Chunk 11: 1898 tokens\n", + "Chunk 12: 3186 tokens\n", + "Chunk 13: 2337 tokens\n", + "Chunk 14: 1889 tokens\n", + "Chunk 15: 1948 tokens\n", + "Chunk 16: 1628 tokens\n", + "Chunk 17: 3544 tokens\n", + "Chunk 18: 2454 tokens\n", + "Chunk 19: 1882 tokens\n", + "Split document into 20 chunks\n", + "Chunk 0: 1630 tokens\n", + "Chunk 1: 2311 tokens\n", + "Chunk 2: 2362 tokens\n", + "Chunk 3: 3294 tokens\n", + "Chunk 4: 2576 tokens\n", + "Chunk 5: 2645 tokens\n", + "Chunk 6: 2378 tokens\n", + "Chunk 7: 2055 tokens\n", + "Chunk 8: 1843 tokens\n", + "Chunk 9: 1999 tokens\n", + "Chunk 10: 2540 tokens\n", + "Chunk 11: 3064 tokens\n", + "Chunk 12: 1892 tokens\n", + "Chunk 13: 3698 tokens\n", + "Chunk 14: 2071 tokens\n", + "Chunk 15: 2685 tokens\n", + "Chunk 16: 1838 tokens\n", + "Chunk 17: 2729 tokens\n", + "Chunk 18: 2252 tokens\n", + "Chunk 19: 1856 tokens\n", + "Split document into 20 chunks\n", + "Chunk 0: 2129 tokens\n", + "Chunk 1: 2212 tokens\n", + "Chunk 2: 3009 tokens\n", + "Chunk 3: 2420 tokens\n", + "Chunk 4: 1718 tokens\n", + "Chunk 5: 2216 tokens\n", + "Chunk 6: 2003 tokens\n", + "Chunk 7: 3000 tokens\n", + "Chunk 8: 2795 tokens\n", + "Chunk 9: 3459 tokens\n", + "Chunk 10: 3457 tokens\n", + "Chunk 11: 2353 tokens\n", + "Chunk 12: 2996 tokens\n", + "Chunk 13: 1964 tokens\n", + "Chunk 14: 2418 tokens\n", + "Chunk 15: 2158 tokens\n", + "Chunk 16: 2678 tokens\n", + "Chunk 17: 1911 tokens\n", + "Chunk 18: 2157 tokens\n", + "Chunk 19: 1897 tokens\n", + "Split document into 20 chunks\n", + "Chunk 0: 1601 tokens\n", + "Chunk 1: 1702 tokens\n", + "Chunk 2: 1915 tokens\n", + "Chunk 3: 1908 tokens\n", + "Chunk 4: 2540 tokens\n", + "Chunk 5: 2088 tokens\n", + "Chunk 6: 2438 tokens\n", + "Chunk 7: 2185 tokens\n", + "Chunk 8: 2043 tokens\n", + "Chunk 9: 2967 tokens\n", + "Chunk 10: 3331 tokens\n", + "Chunk 11: 1551 tokens\n", + "Chunk 12: 1807 tokens\n", + "Chunk 13: 2116 tokens\n", + "Chunk 14: 4253 tokens\n", + "Chunk 15: 2416 tokens\n", + "Chunk 16: 2131 tokens\n", + "Chunk 17: 2188 tokens\n", + "Chunk 18: 2039 tokens\n", + "Chunk 19: 1775 tokens\n", + "Split document into 20 chunks\n", + "Chunk 0: 2984 tokens\n", + "Chunk 1: 2735 tokens\n", + "Chunk 2: 2135 tokens\n", + "Chunk 3: 2716 tokens\n", + "Chunk 4: 2517 tokens\n", + "Chunk 5: 3030 tokens\n", + "Chunk 6: 2850 tokens\n", + "Chunk 7: 2431 tokens\n", + 
"Chunk 8: 3630 tokens\n", + "Chunk 9: 2415 tokens\n", + "Chunk 10: 2629 tokens\n", + "Chunk 11: 1782 tokens\n", + "Chunk 12: 1878 tokens\n", + "Chunk 13: 2010 tokens\n", + "Chunk 14: 1769 tokens\n", + "Chunk 15: 1692 tokens\n", + "Chunk 16: 2309 tokens\n", + "Chunk 17: 1844 tokens\n", + "Chunk 18: 2057 tokens\n", + "Chunk 19: 2012 tokens\n", + "Split document into 20 chunks\n", + "Chunk 0: 3651 tokens\n", + "Chunk 1: 2493 tokens\n", + "Chunk 2: 2875 tokens\n", + "Chunk 3: 2482 tokens\n", + "Chunk 4: 1654 tokens\n", + "Chunk 5: 1690 tokens\n", + "Chunk 6: 1961 tokens\n", + "Chunk 7: 1277 tokens\n", + "Chunk 8: 1977 tokens\n", + "Chunk 9: 2279 tokens\n", + "Chunk 10: 1524 tokens\n", + "Chunk 11: 2668 tokens\n", + "Chunk 12: 2086 tokens\n", + "Chunk 13: 2358 tokens\n", + "Chunk 14: 1976 tokens\n", + "Chunk 15: 1607 tokens\n", + "Chunk 16: 1590 tokens\n", + "Chunk 17: 1769 tokens\n", + "Chunk 18: 2113 tokens\n", + "Chunk 19: 2472 tokens\n", + "\n", + "==== ROUTING AT DEPTH 1 ====\n", + "Evaluating 180 chunks for relevance\n", + "Selected chunks: 0, 5, 6, 7, 8, 20, 31, 33, 53, 56, 62, 65, 67, 73, 84, 85, 86\n", + "Updated scratchpad:\n", + "DEPTH 0 REASONING:\n", + "The user's question is about the format for filing a motion to compel discovery and the handling of signatures.\n", + "\n", + "Chunks that discuss motions in general, format of motions, motions to compel specifically, and signature requirements are most relevant. \n", + "\n", + "From the chunks, relevant sections appear to include:\n", + "- Chunk 0: § 106.02 on Signature of Submissions, § 106.03 on form of submissions, including electronic and paper, and § 113 on Service of papers including certifications.\n", + "- Chunk 5: § 311.01(c) on Filing and Service of Answer, § 523.01 and § 523.02 on Motion to Compel Disclosure or Discovery.\n", + "- Chunk 7&8: § 801 on briefs, § 802 on oral hearing, § 503 on Motion to Dismiss for Failure, § 504 on Motion for Judgment on Pleadings, § 505 Motion for More Definite Statement, § 506 Motion to Strike.\n", + "- Chunk 12, 13,14: § 502 on Motions - In general, form of motions, briefs on motions, and motion to strike briefs; § 527 Motions for Sanctions.\n", + "- Chunk 18, 19: § 707 on Objections to Evidence including motions to strike evidence and handling of objections, waiver.\n", + "\n", + "Parts on signatures reference both paper and electronic signatures, certifications associated with signatures and the importance of including the person's name and capacity.\n", + "\n", + "The detailed rules on electronic filing and motions requiring explanations for filing in paper (technical problems/extraordinary circumstances) could be relevant to motions generally.\n", + "\n", + "Overall, to fully answer the user, we will need to address:\n", + "- The proper format and form of a discovery motion such as a motion to compel\n", + "- How signatures are handled for such motions (electronic signature, requirements on identification of signer etc)\n", + "\n", + "Thus, I select chunks 0, 5, 7, 8, 12, 13, 14, 18, 19 for thorough coverage of these points.\n", + "\n", + "DEPTH 1 REASONING:\n", + "The user seeks the format required for a motion to compel discovery and how signatures should be handled.\n", + "\n", + "Relevant points will be about:\n", + "- The general rules for formatting submissions including motions to compel\n", + "- Specific provisions for motions to compel discovery\n", + "- Requirements and formats for signatures on Board submissions and motions\n", + "- Electronic filing and signature 
requirements\n", + "- Service and certificates of service related to motions, including motions to compel\n", + "- Timing and procedural rules for motions to compel\n", + "\n", + "Sections dealing with types of Board submissions and motions (including motions to compel), signature requirements, electronic filing procedures (via ESTTA), and service rules are crucial.\n", + "\n", + "From the read-through, these are the most relevant areas:\n", + "- Chunks 0, 5, 6, 7, 8: Format, submission, signature, electronic filing, and service rules.\n", + "- Chunks 20, 31, 33, 53, 53, 56, 58, 62, 65, 67, 73: Discovery and motions to compel discovery, responses, time limits, etc.\n", + "- Chunks 78, 79, 80, 81, 82, 83, 84, 85, 86: More on motions, briefs, and filing rules.\n", + "\n", + "In particular, motion to compel discovery is covered in 523.01 and 523.02 (Chunk 84 from close reading), which provides details about what is needed in the motion, timing, and procedure.\n", + "\n", + "Signature rules and format are in 106.02 and 106.03 (Chucks 5, 6, 7).\n", + "\n", + "Therefore, the most critical chunks to select to answer completely are 0, 5, 6, 7, 8, 20, 31, 33, 53, 56, 62, 65, 67, 73, 84, 85, 86.\n", + "\n", + "These provide comprehensive info about motion format, signature handling, submission requirements, and specific motion to compel details.\n", + "Split document into 8 chunks\n", + "Chunk 0: 376 tokens\n", + "Chunk 1: 359 tokens\n", + "Chunk 2: 461 tokens\n", + "Chunk 3: 1078 tokens\n", + "Chunk 4: 386 tokens\n", + "Chunk 5: 390 tokens\n", + "Chunk 6: 393 tokens\n", + "Chunk 7: 95 tokens\n", + "Split document into 8 chunks\n", + "Chunk 0: 398 tokens\n", + "Chunk 1: 256 tokens\n", + "Chunk 2: 389 tokens\n", + "Chunk 3: 356 tokens\n", + "Chunk 4: 401 tokens\n", + "Chunk 5: 277 tokens\n", + "Chunk 6: 435 tokens\n", + "Chunk 7: 265 tokens\n", + "Split document into 6 chunks\n", + "Chunk 0: 353 tokens\n", + "Chunk 1: 393 tokens\n", + "Chunk 2: 388 tokens\n", + "Chunk 3: 398 tokens\n", + "Chunk 4: 397 tokens\n", + "Chunk 5: 247 tokens\n", + "Split document into 5 chunks\n", + "Chunk 0: 325 tokens\n", + "Chunk 1: 389 tokens\n", + "Chunk 2: 303 tokens\n", + "Chunk 3: 344 tokens\n", + "Chunk 4: 306 tokens\n", + "Split document into 6 chunks\n", + "Chunk 0: 380 tokens\n", + "Chunk 1: 396 tokens\n", + "Chunk 2: 384 tokens\n", + "Chunk 3: 368 tokens\n", + "Chunk 4: 208 tokens\n", + "Chunk 5: 215 tokens\n", + "Split document into 4 chunks\n", + "Chunk 0: 287 tokens\n", + "Chunk 1: 396 tokens\n", + "Chunk 2: 403 tokens\n", + "Chunk 3: 382 tokens\n", + "Split document into 9 chunks\n", + "Chunk 0: 361 tokens\n", + "Chunk 1: 334 tokens\n", + "Chunk 2: 228 tokens\n", + "Chunk 3: 215 tokens\n", + "Chunk 4: 275 tokens\n", + "Chunk 5: 349 tokens\n", + "Chunk 6: 397 tokens\n", + "Chunk 7: 388 tokens\n", + "Chunk 8: 128 tokens\n", + "Split document into 4 chunks\n", + "Chunk 0: 401 tokens\n", + "Chunk 1: 399 tokens\n", + "Chunk 2: 392 tokens\n", + "Chunk 3: 217 tokens\n", + "Split document into 4 chunks\n", + "Chunk 0: 354 tokens\n", + "Chunk 1: 405 tokens\n", + "Chunk 2: 403 tokens\n", + "Chunk 3: 361 tokens\n", + "Split document into 7 chunks\n", + "Chunk 0: 293 tokens\n", + "Chunk 1: 396 tokens\n", + "Chunk 2: 373 tokens\n", + "Chunk 3: 372 tokens\n", + "Chunk 4: 387 tokens\n", + "Chunk 5: 332 tokens\n", + "Chunk 6: 295 tokens\n", + "Split document into 5 chunks\n", + "Chunk 0: 401 tokens\n", + "Chunk 1: 380 tokens\n", + "Chunk 2: 400 tokens\n", + "Chunk 3: 391 tokens\n", + "Chunk 4: 113 tokens\n", + "Split 
document into 5 chunks\n", + "Chunk 0: 339 tokens\n", + "Chunk 1: 382 tokens\n", + "Chunk 2: 399 tokens\n", + "Chunk 3: 385 tokens\n", + "Chunk 4: 70 tokens\n", + "Split document into 6 chunks\n", + "Chunk 0: 362 tokens\n", + "Chunk 1: 393 tokens\n", + "Chunk 2: 339 tokens\n", + "Chunk 3: 330 tokens\n", + "Chunk 4: 386 tokens\n", + "Chunk 5: 173 tokens\n", + "Split document into 7 chunks\n", + "Chunk 0: 398 tokens\n", + "Chunk 1: 398 tokens\n", + "Chunk 2: 270 tokens\n", + "Chunk 3: 327 tokens\n", + "Chunk 4: 396 tokens\n", + "Chunk 5: 397 tokens\n", + "Chunk 6: 149 tokens\n", + "Split document into 8 chunks\n", + "Chunk 0: 374 tokens\n", + "Chunk 1: 378 tokens\n", + "Chunk 2: 378 tokens\n", + "Chunk 3: 251 tokens\n", + "Chunk 4: 367 tokens\n", + "Chunk 5: 359 tokens\n", + "Chunk 6: 400 tokens\n", + "Chunk 7: 69 tokens\n", + "Split document into 8 chunks\n", + "Chunk 0: 400 tokens\n", + "Chunk 1: 272 tokens\n", + "Chunk 2: 321 tokens\n", + "Chunk 3: 387 tokens\n", + "Chunk 4: 388 tokens\n", + "Chunk 5: 381 tokens\n", + "Chunk 6: 348 tokens\n", + "Chunk 7: 148 tokens\n", + "Split document into 7 chunks\n", + "Chunk 0: 359 tokens\n", + "Chunk 1: 354 tokens\n", + "Chunk 2: 362 tokens\n", + "Chunk 3: 376 tokens\n", + "Chunk 4: 357 tokens\n", + "Chunk 5: 370 tokens\n", + "Chunk 6: 200 tokens\n", + "\n", + "==== ROUTING AT DEPTH 2 ====\n", + "Evaluating 107 chunks for relevance\n", + "Selected chunks: 0, 11, 12, 13, 14, 15, 16, 18, 19, 20, 23, 24, 25, 26, 27, 28, 29, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106\n", + "Updated scratchpad:\n", + "DEPTH 0 REASONING:\n", + "The user's question is about the format for filing a motion to compel discovery and the handling of signatures.\n", + "\n", + "Chunks that discuss motions in general, format of motions, motions to compel specifically, and signature requirements are most relevant. 
\n", + "\n", + "From the chunks, relevant sections appear to include:\n", + "- Chunk 0: § 106.02 on Signature of Submissions, § 106.03 on form of submissions, including electronic and paper, and § 113 on Service of papers including certifications.\n", + "- Chunk 5: § 311.01(c) on Filing and Service of Answer, § 523.01 and § 523.02 on Motion to Compel Disclosure or Discovery.\n", + "- Chunk 7&8: § 801 on briefs, § 802 on oral hearing, § 503 on Motion to Dismiss for Failure, § 504 on Motion for Judgment on Pleadings, § 505 Motion for More Definite Statement, § 506 Motion to Strike.\n", + "- Chunk 12, 13,14: § 502 on Motions - In general, form of motions, briefs on motions, and motion to strike briefs; § 527 Motions for Sanctions.\n", + "- Chunk 18, 19: § 707 on Objections to Evidence including motions to strike evidence and handling of objections, waiver.\n", + "\n", + "Parts on signatures reference both paper and electronic signatures, certifications associated with signatures and the importance of including the person's name and capacity.\n", + "\n", + "The detailed rules on electronic filing and motions requiring explanations for filing in paper (technical problems/extraordinary circumstances) could be relevant to motions generally.\n", + "\n", + "Overall, to fully answer the user, we will need to address:\n", + "- The proper format and form of a discovery motion such as a motion to compel\n", + "- How signatures are handled for such motions (electronic signature, requirements on identification of signer etc)\n", + "\n", + "Thus, I select chunks 0, 5, 7, 8, 12, 13, 14, 18, 19 for thorough coverage of these points.\n", + "\n", + "DEPTH 1 REASONING:\n", + "The user seeks the format required for a motion to compel discovery and how signatures should be handled.\n", + "\n", + "Relevant points will be about:\n", + "- The general rules for formatting submissions including motions to compel\n", + "- Specific provisions for motions to compel discovery\n", + "- Requirements and formats for signatures on Board submissions and motions\n", + "- Electronic filing and signature requirements\n", + "- Service and certificates of service related to motions, including motions to compel\n", + "- Timing and procedural rules for motions to compel\n", + "\n", + "Sections dealing with types of Board submissions and motions (including motions to compel), signature requirements, electronic filing procedures (via ESTTA), and service rules are crucial.\n", + "\n", + "From the read-through, these are the most relevant areas:\n", + "- Chunks 0, 5, 6, 7, 8: Format, submission, signature, electronic filing, and service rules.\n", + "- Chunks 20, 31, 33, 53, 53, 56, 58, 62, 65, 67, 73: Discovery and motions to compel discovery, responses, time limits, etc.\n", + "- Chunks 78, 79, 80, 81, 82, 83, 84, 85, 86: More on motions, briefs, and filing rules.\n", + "\n", + "In particular, motion to compel discovery is covered in 523.01 and 523.02 (Chunk 84 from close reading), which provides details about what is needed in the motion, timing, and procedure.\n", + "\n", + "Signature rules and format are in 106.02 and 106.03 (Chucks 5, 6, 7).\n", + "\n", + "Therefore, the most critical chunks to select to answer completely are 0, 5, 6, 7, 8, 20, 31, 33, 53, 56, 62, 65, 67, 73, 84, 85, 86.\n", + "\n", + "These provide comprehensive info about motion format, signature handling, submission requirements, and specific motion to compel details.\n", + "\n", + "DEPTH 2 REASONING:\n", + "The user's question is about the format for filing a 
motion to compel discovery and the handling of signatures. From the initial overview and scratchpad, relevant sections include those on signature requirements (106.02, 106.03, 114), electronic filing (document format, ESTTA use), and specifically motions to compel discovery (523.01, 523.02). They also want to know about how signatures should be handled, including electronic signatures.\n", + "\n", + "Chunks 0 through 31 deal extensively with signature rules, form and format of submissions, ESTTA use, and service rules. Chunks 33-37 on service and filing of answers, which set an analogous framework. Chunks 88-106 deal directly with motions to compel discovery rules, required contents, timing, good faith meet and confer requirements, and procedural suspensions.\n", + "\n", + "Thus, to answer comprehensively, the chosen chunks to review and extract precise rules are 0, 11, 12, 13, 14, 15, 16, 18, 19, 20, 23, 24, 25, 26, 27, 28, 29, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106.\n", + "\n", + "This will provide a thorough understanding of motion format, signature protocol, good faith meet and confer, timing of motions to compel discovery and how signatures (electronic and paper) should be executed and handled.\n", + "\n", + "Returning 54 relevant chunks at depth 2\n", + "\n", + "==== FIRST 3 RETRIEVED PARAGRAPHS ====\n", + "\n", + "PARAGRAPH 1 (ID: 0.0.0):\n", + "----------------------------------------\n", + "TRADEMARK TRIAL AND\n", + "APPEAL BOARD MANUAL\n", + "OF PROCEDURE (TBMP)\n", + " June 2024\n", + "June 2024\n", + "United States Patent and Trademark Office\n", + "PREFACE TO THE JUNE 2024 REVISION\n", + "The June 2024 revision of the Trademark Trial and Appeal Board Manual of Procedure is an update of the\n", + "June 2023 edition. This update is moderate in nature and incorporates relevant case law issued between March\n", + "3, 2023 and March 1, 2024. The title of the manual is abbreviated as “TBMP.” A citation to a section of the manual may be written as\n", + "“TBMP § _____ (2024).”\n", + "As with previous editions, this edition is available online at the TTAB home page of the USPTO web site in\n", + "a searchable, printable format as well as in PDF. Archived editions of the TBMP are also available at the TTAB's\n", + "home page. Several TTAB judges, attorneys, paralegals, and professional staff members contributed to this year’s annual\n", + "update. Their efforts in ensuring an accurate and timely update are very appreciated. Cheryl Butler\n", + " Senior Counsel and TBMP Editor\n", + " Trademark Trial and Appeal Board\n", + "June 2024\n", + "INTRODUCTION\n", + " INTRODUCTION\n", + "The primary purpose of this manual is to pro vide stakeholders with basic information generally useful for\n", + "litigating inter partes (trial) cases before the Trademark Trial and Appeal Board. The manual does not modify,\n", + "amend, or serve as a substitute for any existing statutes, rules, or decisional law and is not binding upon the\n", + "Board, its re viewing tribunals, the Director, or the USPT O. 
Cf., In re Wine Society of America Inc., 12\n", + "USPQ2d 1139 (TTAB 1989).\n", + "----------------------------------------\n", + "\n", + "PARAGRAPH 2 (ID: 0.0.0.5.3):\n", + "----------------------------------------\n", + "* * * *\n", + "Parties or their attorneys or other authorized representatives may telephone the Board to inquire about the\n", + "status of a case or to ask for procedural information, but not to discuss the merits of a case or any particular\n", + "issue. [Note 1.] The telephone number of the Board is (571) 272-8500, or (800) 786-9199 (toll free). If an\n", + "inquiry involves a particular case, the person making the inquiry should be prepared to give the number of\n", + "the proceeding or application in question. Inquiries to the Board should not be made by email. NOTES:\n", + "1. 37 C.F.R. § 11.305(b). See Melwani v. Allegiance Corp., 97 USPQ2d 1537, 1541-42 n.17 (TTAB 2010)\n", + "(to inquire as to status of Board proceeding, party may call Board during business hours). 106 Identification, Signature, and Form of Submissions\n", + "106.01 Identification of Submissions\n", + "A submission filed in a proceeding before the Board should bear at its top the heading “IN THE UNITED\n", + "STATES PATENT AND TRADEMARK OFFICE BEFORE THE TRADEMARK TRIAL AND APPEAL\n", + "BOARD,” followed by the caption identifying the parties’ names and the number of the inter partes proceeding\n", + "to which it relates. [Note 1.] In the case of an e x parte appeal of an application, or an e xtension of time to\n", + "oppose, the application serial number is to be pro vided. In the case of an e x parte appeal of an e x parte\n", + "expungement or reexamination proceeding, the registration number is to be provided.\n", + "----------------------------------------\n", + "\n", + "PARAGRAPH 3 (ID: 0.0.0.5.4):\n", + "----------------------------------------\n", + "The document should\n", + "also include a title describing its nature, e.g., “Notice of Opposition,” “Answer,” “Motion to Compel,” “Brief\n", + "in Opposition to Respondent’s Motion for Summary Judgment,” or “Notice of Reliance.”\n", + "Documents filed in an application which is the subject of an inter partes proceeding before the Board should\n", + "be filed with the Board, not the Trademark Operation, and should bear at the top of the first page both the\n", + "application serial number, and the inter partes proceeding number and caption. Similarly , requests under\n", + "Trademark Act § 7, 15 U.S.C. § 1057, to amend, correct, or surrender a registration which is the subject of\n", + "a Board inter partes proceeding, and any new power of attorney, designation of domestic representative, or\n", + "change of address submitted in connection with such a registration, should be filed with the Board, not with\n", + "the Trademark Operation, and should bear at the top of its first page the re gistration number, and the inter\n", + "partes proceeding number and the proceeding caption. [Note 2.] 100-14June 2024\n", + "TRADEMARK TRIAL AND APPEAL BOARD MANUAL OF PROCEDURE§ 105\n", + "NOTES:\n", + "1. 37 C.F.R. § 2.194. 2. 37 C.F.R. § 2.194. 106.02 Signature of Submissions\n", + "37 C.F.R. 
§ 2.119(e) Every submission filed in an inter partes proceeding, and every request for an extension\n", + "of time to file an opposition, must be signed by the party filing it, or by the party’s attorney or other authorized\n", + "representative, but an unsigned submission will not be r efused consideration if a signed copy is submitted\n", + "to the Office within the time limit set in the notification of this defect by the Office. 37 C.F.R. § 11.14(e) Appearance.\n", + "----------------------------------------\n" + ] + } + ], + "source": [ + "# Run the navigation for a sample question\n", + "question = \"What format should a motion to compel discovery be filed in? How should signatures be handled?\"\n", + "navigation_result = navigate_to_paragraphs(document_text, question, max_depth=2)\n", + "\n", + "# Sample retrieved paragraph\n", + "print(\"\\n==== FIRST 3 RETRIEVED PARAGRAPHS ====\")\n", + "for i, paragraph in enumerate(navigation_result[\"paragraphs\"][:3]):\n", + "    display_id = paragraph.get(\"display_id\", str(paragraph[\"id\"]))\n", + "    print(f\"\\nPARAGRAPH {i+1} (ID: {display_id}):\")\n", + "    print(\"-\" * 40)\n", + "    print(paragraph[\"text\"])\n", + "    print(\"-\" * 40)" + ] + }, + { + "cell_type": "markdown", + "id": "dcf85b3e", + "metadata": {}, + "source": [ + "\n", + "The chunking and routing process closely mirrors legal analysis methodology. At Depth 0, the system begins with broad categorization (\"Chunk 0 includes detailed information on filings, electronic filing via ESTTA, signature requirements\"), then refines to specific procedural sections at Depth 1 (\"Chunks 500-530 cover stipulations and motions procedures including motions to compel discovery\"), ultimately focusing on precise rules at Depth 2 (\"Chunks 11-23 provide detailed rules on identification and signature of submissions\"). This progressive refinement from general principles to specific provisions parallels how attorneys analyze legal questions: first identifying the relevant areas of law, then narrowing to specific statutes, and finally focusing on controlling provisions and precedent.\n" + ] + }, + { + "cell_type": "markdown", + "id": "495a5230", + "metadata": {}, + "source": [ + "### 3.6 Answer Generation\n", + "\n", + "Now, let's generate an answer using GPT-4.1 with the retrieved paragraphs. \n", + "\n", + "> We do a nifty trick here where we dynamically construct a List of Literals (which forces the model's answers to be one of the options we provide -- in this case the paragraph IDs). There are some restrictions on the number of options we can provide, so if you find your system citing > 500 documents, then this solution might not work. In that case, you can either have a filter to go up to 500 potential citations, or you can ask the model to cite the exact ID in its response, then post-process the response to extract the IDs and thus the citations (e.g. it might say \"... 
[doc 0.0.12]\", and you could use some regex to extract the citation).\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c74cfe50", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "==== GENERATING ANSWER ====\n", + "\n", + "Answer: A motion to compel discovery must be filed through ESTTA (the Board's Electronic System for Trademark Trials and Appeals), unless ESTTA is unavailable due to technical problems or extraordinary circumstances, in which case a paper filing is allowed with a written explanation and, if required, a Petition to the Director and the requisite fee. The motion should include a title such as “Motion to Compel,” the appropriate heading and caption, and must comply with formatting requirements (at least 11-point type, double-spaced for electronic submissions) (0.0.0.5.3, 0.0.0.5.4, 0.0.0.7.1, 0.0.0.7.2, 0.0.0.7.4).\n", + "\n", + "Every motion to compel must be signed by the party, their attorney, or other authorized representative. For electronic filings, an electronic signature is required, which consists of any combination of letters, numbers, spaces, and/or punctuation marks adopted as a signature, placed between two forward slash (“/”) symbols (e.g., /John Doe/). The signatory's first and last name, title or position, business address, and telephone number must be included immediately below or adjacent to the signature (0.0.0.5.4, 0.0.0.5.6, 0.0.0.6.0, 0.0.0.6.2, 0.0.0.6.4).\n", + "\n", + "If a submission is unsigned or improperly signed, it will not be refused consideration if a properly signed copy is submitted within the time limit set in the notification of this defect by the Board (0.0.0.6.4).\n", + "Citations: ['0.0.0.5.3', '0.0.0.5.4', '0.0.0.7.1', '0.0.0.7.2', '0.0.0.7.4', '0.0.0.5.6', '0.0.0.6.0', '0.0.0.6.2', '0.0.0.6.4']\n" + ] + } + ], + "source": [ + "from typing import List, Dict, Any\n", + "from pydantic import BaseModel, field_validator\n", + "\n", + "class LegalAnswer(BaseModel):\n", + " \"\"\"Structured response format for legal questions\"\"\"\n", + " answer: str\n", + " citations: List[str]\n", + " \n", + " @field_validator('citations')\n", + " def validate_citations(cls, citations, info):\n", + " # Access valid_citations from the model_config\n", + " valid_citations = info.data.get('_valid_citations', [])\n", + " if valid_citations:\n", + " for citation in citations:\n", + " if citation not in valid_citations:\n", + " raise ValueError(f\"Invalid citation: {citation}. 
Must be one of: {valid_citations}\")\n", + " return citations\n", + "\n", + "def generate_answer(question: str, paragraphs: List[Dict[str, Any]], \n", + " scratchpad: str) -> LegalAnswer:\n", + " \"\"\"Generate an answer from the retrieved paragraphs.\"\"\"\n", + " print(\"\\n==== GENERATING ANSWER ====\")\n", + " \n", + " # Extract valid citation IDs\n", + " valid_citations = [str(p.get(\"display_id\", str(p[\"id\"]))) for p in paragraphs]\n", + " \n", + " if not paragraphs:\n", + " return LegalAnswer(\n", + " answer=\"I couldn't find relevant information to answer this question in the document.\",\n", + " citations=[],\n", + " _valid_citations=[]\n", + " )\n", + " \n", + " # Prepare context for the model\n", + " context = \"\"\n", + " for paragraph in paragraphs:\n", + " display_id = paragraph.get(\"display_id\", str(paragraph[\"id\"]))\n", + " context += f\"PARAGRAPH {display_id}:\\n{paragraph['text']}\\n\\n\"\n", + " \n", + " system_prompt = \"\"\"You are a legal research assistant answering questions about the \n", + "Trademark Trial and Appeal Board Manual of Procedure (TBMP).\n", + "\n", + "Answer questions based ONLY on the provided paragraphs. Do not rely on any foundation knowledge or external information or extrapolate from the paragraphs.\n", + "Cite phrases of the paragraphs that are relevant to the answer. This will help you be more specific and accurate.\n", + "Include citations to paragraph IDs for every statement in your answer. Valid citation IDs are: {valid_citations_str}\n", + "Keep your answer clear, precise, and professional.\n", + "\"\"\"\n", + " valid_citations_str = \", \".join(valid_citations)\n", + " \n", + " # Call the model using structured output\n", + " response = client.responses.parse(\n", + " model=\"gpt-4.1\",\n", + " input=[\n", + " {\"role\": \"system\", \"content\": system_prompt.format(valid_citations_str=valid_citations_str)},\n", + " {\"role\": \"user\", \"content\": f\"QUESTION: {question}\\n\\nSCRATCHPAD (Navigation reasoning):\\n{scratchpad}\\n\\nPARAGRAPHS:\\n{context}\"}\n", + " ],\n", + " text_format=LegalAnswer,\n", + " temperature=0.3\n", + " )\n", + " \n", + " # Add validation information after parsing\n", + " response.output_parsed._valid_citations = valid_citations\n", + " \n", + " print(f\"\\nAnswer: {response.output_parsed.answer}\")\n", + " print(f\"Citations: {response.output_parsed.citations}\")\n", + "\n", + " return response.output_parsed\n", + "\n", + "# Generate an answer\n", + "answer = generate_answer(question, navigation_result[\"paragraphs\"], \n", + " navigation_result[\"scratchpad\"])" + ] + }, + { + "cell_type": "markdown", + "id": "83d5e682", + "metadata": {}, + "source": [ + "GPT-4.1 then assembles comprehensive legal responses by referencing specific chunks (through paragraph IDs like \"0.0.0.6.4\") while weaving in critical procedural specifications—including the \"/John Doe/\" electronic signature format and mandatory 11-point, double-spaced typography requirements—extracted verbatim from authoritative TTAB documentation." 
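+ ,
+ "\n",
+ "\n",
+ "As a minimal sketch of the dynamically constructed List of Literals mentioned above (this notebook enforces citations with a `field_validator` instead, so `build_answer_model` and the sample IDs below are purely illustrative):\n",
+ "\n",
+ "```python\n",
+ "from typing import List, Literal\n",
+ "from pydantic import create_model\n",
+ "\n",
+ "def build_answer_model(valid_ids: List[str]):\n",
+ "    # Build a Literal type at runtime (assumes at least one retrieved ID) so structured output can only emit those IDs.\n",
+ "    CitationId = Literal[tuple(valid_ids)]  # type: ignore[valid-type]\n",
+ "    return create_model(\"ConstrainedLegalAnswer\", answer=(str, ...), citations=(List[CitationId], ...))\n",
+ "\n",
+ "# Hypothetical usage with two retrieved paragraph IDs:\n",
+ "AnswerModel = build_answer_model([\"0.0.0.5.4\", \"0.0.0.6.2\"])\n",
+ "```\n",
+ "\n",
+ "A model built this way could then be passed as `text_format` in place of `LegalAnswer`, so hallucinated paragraph IDs would be rejected at parse time rather than caught after the fact."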
+ ] + }, + { + "cell_type": "markdown", + "id": "b9cfe43b", + "metadata": {}, + "source": [ + "### 3.7 Answer Verification\n", + "\n", + "Let's first look at the cited paragraphs:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4b5e9cd9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "==== CITED PARAGRAPHS ====\n", + "\n", + "PARAGRAPH 1 (ID: 0.0.0.5.3):\n", + "----------------------------------------\n", + "* * * *\n", + "Parties or their attorneys or other authorized representatives may telephone the Board to inquire about the\n", + "status of a case or to ask for procedural information, but not to discuss the merits of a case or any particular\n", + "issue. [Note 1.] The telephone number of the Board is (571) 272-8500, or (800) 786-9199 (toll free). If an\n", + "inquiry involves a particular case, the person making the inquiry should be prepared to give the number of\n", + "the proceeding or application in question. Inquiries to the Board should not be made by email. NOTES:\n", + "1. 37 C.F.R. § 11.305(b). See Melwani v. Allegiance Corp., 97 USPQ2d 1537, 1541-42 n.17 (TTAB 2010)\n", + "(to inquire as to status of Board proceeding, party may call Board during business hours). 106 Identification, Signature, and Form of Submissions\n", + "106.01 Identification of Submissions\n", + "A submission filed in a proceeding before the Board should bear at its top the heading “IN THE UNITED\n", + "STATES PATENT AND TRADEMARK OFFICE BEFORE THE TRADEMARK TRIAL AND APPEAL\n", + "BOARD,” followed by the caption identifying the parties’ names and the number of the inter partes proceeding\n", + "to which it relates. [Note 1.] In the case of an e x parte appeal of an application, or an e xtension of time to\n", + "oppose, the application serial number is to be pro vided. In the case of an e x parte appeal of an e x parte\n", + "expungement or reexamination proceeding, the registration number is to be provided.\n", + "----------------------------------------\n", + "\n", + "PARAGRAPH 2 (ID: 0.0.0.5.4):\n", + "----------------------------------------\n", + "The document should\n", + "also include a title describing its nature, e.g., “Notice of Opposition,” “Answer,” “Motion to Compel,” “Brief\n", + "in Opposition to Respondent’s Motion for Summary Judgment,” or “Notice of Reliance.”\n", + "Documents filed in an application which is the subject of an inter partes proceeding before the Board should\n", + "be filed with the Board, not the Trademark Operation, and should bear at the top of the first page both the\n", + "application serial number, and the inter partes proceeding number and caption. Similarly , requests under\n", + "Trademark Act § 7, 15 U.S.C. § 1057, to amend, correct, or surrender a registration which is the subject of\n", + "a Board inter partes proceeding, and any new power of attorney, designation of domestic representative, or\n", + "change of address submitted in connection with such a registration, should be filed with the Board, not with\n", + "the Trademark Operation, and should bear at the top of its first page the re gistration number, and the inter\n", + "partes proceeding number and the proceeding caption. [Note 2.] 100-14June 2024\n", + "TRADEMARK TRIAL AND APPEAL BOARD MANUAL OF PROCEDURE§ 105\n", + "NOTES:\n", + "1. 37 C.F.R. § 2.194. 2. 37 C.F.R. § 2.194. 106.02 Signature of Submissions\n", + "37 C.F.R. 
§ 2.119(e) Every submission filed in an inter partes proceeding, and every request for an extension\n", + "of time to file an opposition, must be signed by the party filing it, or by the party’s attorney or other authorized\n", + "representative, but an unsigned submission will not be r efused consideration if a signed copy is submitted\n", + "to the Office within the time limit set in the notification of this defect by the Office. 37 C.F.R. § 11.14(e) Appearance.\n", + "----------------------------------------\n", + "\n", + "PARAGRAPH 3 (ID: 0.0.0.5.6):\n", + "----------------------------------------\n", + "(b) By presenting to the Office or hearing officer in a disciplinary proceeding (whether by signing,\n", + "filing, submitting, or later advocating) any paper, the party presenting such paper, whether a practitioner\n", + "or non-practitioner, is certifying that—\n", + "(1) All statements made therein of the party’s own knowledge are true, all statements made therein\n", + "on information and belief are believed to be true, and all statements made therein are made with the\n", + "knowledge that whoever, in any matter within the jurisdiction of the Office, knowingly and willfully falsifies,\n", + "conceals, or covers up by any trick, scheme, or device a material fact, or knowingly and willfully makes any\n", + "false, fictitious, or fraudulent statements or representations, or knowingly and willfully makes or uses any\n", + "false writing or document knowing the same to contain any false, fictitious, or fraudulent statement or entry,\n", + "shall be subject to the penalties set forth under 18 U.S.C. 1001 and any other applicable criminal statute,\n", + "and violations of the provisions of this section may jeopardize the probative value of the paper; and\n", + "(2) To the best of the party’s knowledge, information and belief, formed after an inquiry reasonable\n", + "under the circumstances,\n", + "(i) The paper is not being presented for any improper purpose, such as to harass someone or to\n", + "cause unnecessary delay or needless increase in the cost of any proceeding before the Office;\n", + "(ii) The other legal contentions therein are warranted by existing law or by a nonfrivolous\n", + "argument for the extension, modification, or reversal of existing law or the establishment of new law;\n", + "June 2024100-15\n", + "§ 106.02GENERAL INFORMATION\n", + "(iii) The allegations and other factual contentions have evidentiary support or, if specifically so\n", + "identified, are likely to have evidentiary support after a reasonable opportunity for further investigation or\n", + "discovery; and\n", + "(iv) The denials of factual contentions are warranted on the evidence, or if specifically so\n", + "identified, are reasonably based on a lack of information or belief.\n", + "----------------------------------------\n", + "\n", + "PARAGRAPH 4 (ID: 0.0.0.6.0):\n", + "----------------------------------------\n", + "The Office will accept an electronic signature that meets the\n", + "requirements of paragraph (c) of this section on correspondence filed on paper or through TEAS or ESTTA. (b) Copy of original signature. If a copy of an original signature is filed, the filer should retain the\n", + "original as evidence of authenticity. If a question of authenticity arises, the Office may require submission\n", + "of the original. (c) Requirements for electronic signature. 
A person signing a document electronically must:\n", + "(1) Personally enter any combination of letters, numbers, spaces and/or punctuation marks that the\n", + "signer has adopted as a signature, placed between two forward slash (“/”) symbols in the signature block\n", + "on the electronic submission; or\n", + "(2) Sign the verified statement using some other form of electronic signature specified by the Director. (d) Signatory must be identified. The first and last name, and the title or position, of the person who\n", + "signs a document in connection with a trademark application, registration, or proceeding before the\n", + "Trademark Trial and Appeal Board must be set forth immediately below or adjacent to the signature. (e) Proper person to sign. Documents filed in connection with a trademark application or registration\n", + "must be signed as specified in paragraphs (e)(1) through (9) of this section. (2) Responses, amendments to applications, requests for express abandonment, requests for\n", + "reconsideration of final actions, and requests to divide. Responses to Office actions, amendments to\n", + "applications, requests for express abandonment, requests for reconsideration of final actions, and requests\n", + "to divide must be signed by the owner of the application or registration, someone with legal authority to\n", + "bind the owner (e.g.\n", + "----------------------------------------\n", + "\n", + "PARAGRAPH 5 (ID: 0.0.0.6.2):\n", + "----------------------------------------\n", + "* * * *\n", + "(i) Certified documents required by statute. When a statute requires that a document be certified, a\n", + "copy or facsimile transmission of the certification is not acceptable. Every document filed in an inter partes or e x parte proceeding before the Board, and e very request for an\n", + "extension of time to file an opposition, must be signed by the party filing it, or by the party’ s attorney or\n", + "other authorized representative, as appropriate, and the signatory must be identified. [Note 1.] Documents filed electronically, including through ESTTA, do not require a conventional signature. Electronic\n", + "signatures pursuant to 37 C.F.R. § 2.193(c) are required for electronic filings. The party or its representative\n", + "enters a “symbol” that has been adopted as a signature. The Board will accept any combination of letters,\n", + "numbers, space and/or punctuation marks as a valid signature if it is placed between two forward slash (“/”)\n", + "symbols. [Note 2.] The electronic signature entered on the ESTTA form is sufficient as the required signature\n", + "for the entire submission, including in the absence of a signature on any attachment to the filing form. [Note\n", + "3.] The electronic filing cover sheet in ESTTA must be signed by the party filing it, the party’s attorney or\n", + "other authorized representative, as appropriate. For further information regarding the filing of submissions\n", + "using ESTTA, see TBMP § 110. A party may act in its own behalf in a proceeding before the Board, if the party is domiciled in the United\n", + "States, or an attorney may represent the party. [Note 4.] See TBMP § 114 (Representation of a Party). 
When an individual who is a party to a Board proceeding elects to act in the indi vidual's own behalf, the\n", + "individual must sign any documents that are filed with the Board.\n", + "----------------------------------------\n", + "\n", + "PARAGRAPH 6 (ID: 0.0.0.6.4):\n", + "----------------------------------------\n", + "A document filed in a proceeding before the Board should include the first and last name, in typed or printed\n", + "form, of the person who signed [Note 8]; a description of the capacity in which the person signed (e.g., as\n", + "the individual who is a party, if the filing party is an individual; as a corporate officer, if the filing party is\n", + "a corporation; or as the filing party’s attorney); and the business address and telephone number of the person. The inclusion of the signing person’s address and phone number on the submission itself is vital in the rare\n", + "case any paper or physical submissions permitted under the rules because mail physically sent to the Office\n", + "is opened in the Mail Room, and ordinarily the en velopes are discarded there before the mail is sent on to\n", + "its ultimate destination within the Office. Thus, the Board rarely sees the return addresses on the mailing\n", + "envelopes of papers filed in Board proceedings. In accordance with 37 C.F.R. § 2.193(b), a legible copy of the signed document is to be filed with the Board\n", + "because filings are required to be submitted using ESTT A. The original should be retained as e vidence of\n", + "authenticity. If a question as to the authenticity of a filed copy arises, the Office may require submission of\n", + "the original. [Note 9.] Notwithstanding the requirement that a document filed before the Board be signed, an unsigned document\n", + "filed in paper form, when permitted, will not be refused consideration if a signed cop y is submitted to the\n", + "Board within the time limit set in the notification of this defect by the Board. [Note 10.] Similarly , an\n", + "improperly signed document, whether filed in ESTT A or on paper , when permitted, will not be refused\n", + "consideration if a properly signed cop y is submitted to the Board within the time set in the notification of\n", + "this defect by the Board.\n", + "----------------------------------------\n", + "\n", + "PARAGRAPH 7 (ID: 0.0.0.7.1):\n", + "----------------------------------------\n", + "§ 11.18 (a); In re Dermahose Inc., 82 USPQ2d 1793, 1796 (TTAB\n", + "2007) (a person’s declaration cannot be signed by another person); Boyds Collection Ltd. v. Herrington &\n", + "Co., 65 USPQ2d 2017, 2018 (TTAB 2003) (response to motion signed by person on behalf of practitioner\n", + "is inappropriate). Cf. Cerveceria India Inc. v. Cerveceria Centroamericana, S.A., 10 USPQ2d 1064, 1067\n", + "(TTAB 1989) (Section 8 declaration signed by someone other than named person, while perhaps unacceptable,\n", + "does not constitute fraud), aff’d , 892 F.2d 1021, 13 USPQ2d 1307 (Fed. Cir. 1989). 8. See, e.g., 37 C.F.R. § 2.193(d). 9. 37 C.F.R. § 2.193(b). 10. 37 C.F.R. § 2.119(e). 11. Cf. 37 C.F.R. § 2.119(e); Birlinn Ltd. v. Stewart, 111 USPQ2d 1905, 1908 (TTAB 2014) (Board applies\n", + "opportunity to cure pro vision in 2.119(e) to improperly signed papers, which defines the time period for\n", + "cure as “within the time limit set in the notification of this defect by the Office”). 106.03 Form of Submissions\n", + "37 C.F.R. § 2.126 Form of submissions to the Trademark Trial and Appeal Board. 
(a) Submissions must be made to the Trademark Trial and Appeal Board via ESTTA.\n", + "----------------------------------------\n", + "\n", + "PARAGRAPH 8 (ID: 0.0.0.7.2):\n", + "----------------------------------------\n", + "(1) Text in an electronic submission must be filed in at least 11-point type and double-spaced. (2) Exhibits pertaining to an electronic submission must be made electronically as an attachment\n", + "to the submission and must be clear and legible. (b) In the event that ESTTA is unavailable due to technical problems, or when extraordinary\n", + "circumstances are present, submissions may be filed in paper form. All submissions in paper form, except\n", + "the extensions of time to file a notice of opposition, the notice of opposition, the petition to cancel, or answers\n", + "thereto (see §§ 2.101(b)(2), 2.102(a)(2), 2.106(b)(1), 2.111(c)(2), and 2.114(b)(1)), must include a written\n", + "explanation of such technical problems or extraordinary circumstances. Paper submissions that do not meet\n", + "the showing required under this paragraph (b) will not be considered. A paper submission, including exhibits\n", + "and depositions, must meet the following requirements:\n", + "(1) A paper submission must be printed in at least 11-point type and double-spaced, with text on\n", + "one side only of each sheet;\n", + "(2) A paper submission must be 8 to 8.5 inches (20.3 to 21.6 cm.) wide and 11 to 11.69 inches (27.9\n", + "to 29.7 cm.)\n", + "----------------------------------------\n", + "\n", + "PARAGRAPH 9 (ID: 0.0.0.7.4):\n", + "----------------------------------------\n", + "Ho wever, because ESTT A currently does not accept\n", + "multimedia files, the Board will continue its current practice of accepting the submission of physical DVDs\n", + "or CDs as exhibits in inter partes proceedings for the limited purpose of allowing parties to submit multimedia\n", + "evidence, such as commercials. [Note 2.] The requirements for electronic submissions can be found in 37 C.F.R. § 2.126(a). Submissions over the\n", + "Internet are made through ESTT A which is a vailable on the USPT O website. [Note 3.] Using ESTT A, a\n", + "person can complete and submit forms electronically, with attachments and/or exhibits, to the Board, making\n", + "an official filing online. ESTTA gives step-by-step instructions for properly completing a form. Available\n", + "forms and instructions can be found at: https://estta.uspto.gov. For more information re garding ESTTA,\n", + "see TBMP § 110. The Board requires use of ESTTA for the filing of all submissions in Board proceedings. ESTT A permits\n", + "round-the-clock filing with real-time receipt confirmation, while reducing delay and the possibility of\n", + "mishandling of submissions within the USPTO. Many ESTTA filings are processed automatically, with an\n", + "appropriate Board order issuing within minutes of filing. ESTT A users are strongly ur ged to plan ahead. 
Because unexpected problems can occur , users should k eep filing deadlines in mind and allo w plenty of\n", + "time to resolve any issue which may arise.\n", + "----------------------------------------\n" + ] + } + ], + "source": [ + "cited_paragraphs = []\n", + "for paragraph in navigation_result[\"paragraphs\"]:\n", + " para_id = str(paragraph.get(\"display_id\", str(paragraph[\"id\"])))\n", + " if para_id in answer.citations:\n", + " cited_paragraphs.append(paragraph)\n", + " \n", + "\n", + "# Display the cited paragraphs for the audience\n", + "print(\"\\n==== CITED PARAGRAPHS ====\")\n", + "for i, paragraph in enumerate(cited_paragraphs):\n", + " display_id = paragraph.get(\"display_id\", str(paragraph[\"id\"]))\n", + " print(f\"\\nPARAGRAPH {i+1} (ID: {display_id}):\")\n", + " print(\"-\" * 40)\n", + " print(paragraph[\"text\"])\n", + " print(\"-\" * 40)" + ] + }, + { + "cell_type": "markdown", + "id": "b36a8431", + "metadata": {}, + "source": [ + "The \"List of Literals\" trick forces the model to cite only specific paragraph IDs (like \"0.0.0.5.4\") rather than making up its own references or highlighting random text — imagine it as creating a digital \"table of contents\" that GPT-4.1 can only select from. This solution ensures you get verifiable citation trails back to exact source material, solving an important problem in long-context RAG." + ] + }, + { + "cell_type": "markdown", + "id": "d7b1eb2d", + "metadata": {}, + "source": [ + "Finally, let's verify the answer with an LLM-as-judge approach." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "a765a9ad", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "==== VERIFYING ANSWER ====\n", + "\n", + "Accuracy verification: FAILED\n", + "Confidence: high\n", + "Explanation: The answer correctly states that motions must be filed via ESTTA (37 C.F.R. §2.126(a)) and that paper filings are allowed only if ESTTA is unavailable or extraordinary circumstances exist, with a written explanation (0.0.0.7.2). It also accurately describes the required title, heading, caption, formatting (11-point type, double-spaced) (0.0.0.5.4; 0.0.0.7.2), signature form (electronic signatures between slashes, identification of name/title/address/phone) (0.0.0.6.0(c); 0.0.0.6.2; 0.0.0.6.4), and cure of signature defects (0.0.0.6.4). However, it adds an unsupported requirement for a “Petition to the Director and the requisite fee” for paper filings, which is not mentioned in the source paragraphs, and mis‐attributes some citations (e.g., 0.0.0.5.3 and 0.0.0.5.6 are not relevant to the stated points).\n", + "\n", + "==== FINAL VERIFIED ANSWER ====\n", + "Verification: FAILED | Confidence: high\n", + "\n", + "Answer:\n", + "A motion to compel discovery must be filed through ESTTA (the Board's Electronic System for Trademark Trials and Appeals), unless ESTTA is unavailable due to technical problems or extraordinary circumstances, in which case a paper filing is allowed with a written explanation and, if required, a Petition to the Director and the requisite fee. The motion should include a title such as “Motion to Compel,” the appropriate heading and caption, and must comply with formatting requirements (at least 11-point type, double-spaced for electronic submissions) (0.0.0.5.3, 0.0.0.5.4, 0.0.0.7.1, 0.0.0.7.2, 0.0.0.7.4).\n", + "\n", + "Every motion to compel must be signed by the party, their attorney, or other authorized representative. 
For electronic filings, an electronic signature is required, which consists of any combination of letters, numbers, spaces, and/or punctuation marks adopted as a signature, placed between two forward slash (“/”) symbols (e.g., /John Doe/). The signatory's first and last name, title or position, business address, and telephone number must be included immediately below or adjacent to the signature (0.0.0.5.4, 0.0.0.5.6, 0.0.0.6.0, 0.0.0.6.2, 0.0.0.6.4).\n", + "\n", + "If a submission is unsigned or improperly signed, it will not be refused consideration if a properly signed copy is submitted within the time limit set in the notification of this defect by the Board (0.0.0.6.4).\n", + "\n", + "Citations:\n", + "- 0.0.0.5.3\n", + "- 0.0.0.5.4\n", + "- 0.0.0.7.1\n", + "- 0.0.0.7.2\n", + "- 0.0.0.7.4\n", + "- 0.0.0.5.6\n", + "- 0.0.0.6.0\n", + "- 0.0.0.6.2\n", + "- 0.0.0.6.4\n" + ] + } + ], + "source": [ + "from typing import List, Dict, Any, Literal\n", + "from pydantic import BaseModel\n", + "\n", + "class VerificationResult(BaseModel):\n", + " \"\"\"Verification result format\"\"\"\n", + " is_accurate: bool\n", + " explanation: str\n", + " confidence: Literal[\"high\", \"medium\", \"low\"]\n", + "\n", + "def verify_answer(question: str, answer: LegalAnswer, \n", + " cited_paragraphs: List[Dict[str, Any]]) -> VerificationResult:\n", + " \"\"\"\n", + " Verify if the answer is grounded in the cited paragraphs.\n", + " \n", + " Args:\n", + " question: The user's question\n", + " answer: The generated answer\n", + " cited_paragraphs: Paragraphs cited in the answer\n", + " \n", + " Returns:\n", + " Verification result with accuracy assessment, explanation, and confidence level\n", + " \"\"\"\n", + " print(\"\\n==== VERIFYING ANSWER ====\")\n", + " \n", + " # Prepare context with the cited paragraphs\n", + " context = \"\"\n", + " for paragraph in cited_paragraphs:\n", + " display_id = paragraph.get(\"display_id\", str(paragraph[\"id\"]))\n", + " context += f\"PARAGRAPH {display_id}:\\n{paragraph['text']}\\n\\n\"\n", + " \n", + " # Prepare system prompt\n", + " system_prompt = \"\"\"You are a fact-checker for legal information.\n", + "Your job is to verify if the provided answer:\n", + "1. Is factually accurate according to the source paragraphs\n", + "2. 
Uses citations correctly\n", + "\n", + "Be critical and look for any factual errors or unsupported claims.\n", + "Assign a confidence level based on how directly the paragraphs answer the question:\n", + "- high: The answer is comprehensive, accurate, and directly supported by the paragraphs\n", + "- medium: The answer is mostly accurate but may be incomplete or have minor issues\n", + "- low: The answer has significant gaps, inaccuracies, or is poorly supported by the paragraphs\n", + "\"\"\"\n", + "    \n", + "    response = client.responses.parse(\n", + "        model=\"o4-mini\",\n", + "        input=[\n", + "            {\"role\": \"system\", \"content\": system_prompt},\n", + "            {\"role\": \"user\", \"content\": f\"\"\"\n", + "QUESTION: {question}\n", + "\n", + "ANSWER TO VERIFY:\n", + "{answer.answer}\n", + "\n", + "CITATIONS USED: {', '.join(answer.citations)}\n", + "\n", + "SOURCE PARAGRAPHS:\n", + "{context}\n", + "\n", + "Is this answer accurate and properly supported by the source paragraphs?\n", + "Assign a confidence level (high, medium, or low) based on completeness and accuracy.\n", + "    \"\"\"}\n", + "        ],\n", + "        text_format=VerificationResult\n", + "    )\n", + "    \n", + "    # Log and return the verification result\n", + "    print(f\"\\nAccuracy verification: {'PASSED' if response.output_parsed.is_accurate else 'FAILED'}\")\n", + "    print(f\"Confidence: {response.output_parsed.confidence}\")\n", + "    print(f\"Explanation: {response.output_parsed.explanation}\")\n", + "    \n", + "    return response.output_parsed\n", + "\n", + "# Verify the answer using only the cited paragraphs\n", + "verification = verify_answer(question, answer, cited_paragraphs)\n", + "\n", + "# Display final result with verification\n", + "print(\"\\n==== FINAL VERIFIED ANSWER ====\")\n", + "print(f\"Verification: {'PASSED' if verification.is_accurate else 'FAILED'} | Confidence: {verification.confidence}\")\n", + "print(\"\\nAnswer:\")\n", + "print(answer.answer)\n", + "print(\"\\nCitations:\")\n", + "for citation in answer.citations:\n", + "    print(f\"- {citation}\")" + ] + }, + { + "cell_type": "markdown", + "id": "1004942a", + "metadata": {}, + "source": [ + "The verification step produces a clean, structured assessment that references specific regulations and methodically checks both the answer's accuracy and its proper use of citations. Rather than returning a bare pass/fail verdict, it explains exactly which statements are supported and which are not (here it flags the unsupported \"Petition to the Director\" requirement and some mis-attributed citations), so you can decide whether to present the answer as-is, add caveats, or regenerate it before it reaches the user." + ] + }, + { + "cell_type": "markdown", + "id": "29bc9113", + "metadata": {}, + "source": [ + "## 4. Infrastructure Costs\n", + "\n", + "Let's break down the cost structure for this agentic RAG approach:\n", + "\n", + "### Estimated Fixed vs. 
Variable Costs\n", + "\n", + "* **Estimated Fixed (One-time) Costs:** \n", + " * **Traditional RAG:** ~$0.43 (embedding + metadata generation)\n", + " * **Agentic RAG:** $0.00 (zero preprocessing required)\n", + "\n", + "\n", + "* **Estimated Variable (Per-Query) Costs:** \n", + " * **Router Model (`gpt-4.1-mini`):** \n", + " * Initial routing (20 chunks): ~$0.10 \n", + " * Two recursive levels: ~$0.20\n", + " * **Synthesis (`gpt-4.1`):** ~$0.05\n", + " * **Verification (`o4-mini`):** ~$0.01\n", + " * **Total per query:** ~$0.36\n", + "\n", + "While the per-query cost is higher than traditional RAG, this approach offers:\n", + "- Immediate results on new documents\n", + "- More precise citations\n", + "- Better handling of paraphrases and conceptual questions\n", + "- No infrastructure maintenance overhead\n", + "\n", + "The cost can be optimized through:\n", + "- Caching results for common queries\n", + "- Limiting max tokens in the model calls\n", + "- Using a hybrid approach that pre-filters the document first\n", + "\n", + "## 5. Benefits and Tradeoffs versus Traditional RAG\n", + "\n", + "### Benefits\n", + "- **Zero-ingest latency**: Answer questions from new documents immediately, with no preprocessing.\n", + "- **Dynamic navigation**: Mimics human reading patterns by focusing on promising sections.\n", + "- **Cross-section reasoning**: Model can find connections across document sections that might be missed by independent chunk retrieval, potentially increasing accuracy of generated answers and saving time on optimizing retrieval pipelines.\n", + "\n", + "### Tradeoffs\n", + "- **Higher per-query cost**: Requires more computation for each question compared to embedding-based retrieval.\n", + "- **Increased latency**: Hierarchical navigation takes longer to process than simple vector lookups.\n", + "- **Limited scalability**: May struggle with extremely large document collections where preprocessing becomes more efficient.\n", + "\n", + "## 6. Takeaways\n", + "\n", + "1. **Context Window is a Superpower:** Million-token context windows make it possible to navigate documents on-the-fly.\n", + "2. **Hierarchical Approach Mimics Human Reading:** Agentic routing works like a human skimming a document for relevant sections.\n", + "3. **Scratchpad Enables Multi-Step Reasoning:** Maintaining a reasoning record improves navigation quality.\n", + "4. **Fast Implementation, No Database:** The entire system can be built with just API calls, no infrastructure needed.\n", + "5. **Verification Improves Reliability:** The LLM-as-judge pattern catches errors before they reach users.\n", + "\n", + "================================================================================\n", + "\n", + "## 3B. 
Use Case: AI Co-Scientist for Pharma R&D\n", + "![AI Co-Scientist for Pharma R&D](images/3B_reasoning_task_card.png)\n", + "\n", + "This section details how to build an AI system that functions as a \"co-scientist\" to accelerate experimental design in pharmaceutical R&D, focusing on optimizing a drug synthesis process under specific constraints.\n", + "\n", + "## 🗂️ TL;DR Matrix\n", + "\n", + "This table summarizes the core technology choices and their rationale for this specific AI Co-Scientist implementation.\n", + "\n", + "| Layer | Choice | Utility |\n", + "| :----------------- | :------------------------------------------------------------------------ | :------------------------------------------------------------------------------------------------------- |\n", + "| **Ideation** | `o4-mini` (Parallel Role-Playing Agents) | Generates diverse hypotheses & protocols rapidly and cost-effectively; role-playing enhances creativity. |\n", + "| **Grounding** | External Tool Calls (`chem_lookup`, `cost_estimator`, `outcome_db`, etc.) | Ensures plans are based on real-world data (chemical properties, costs, past results). |\n", + "| **Ranking** | `o4-mini` (Pairwise Tournament Comparison) | Nuanced evaluation beyond simple scoring; selects promising candidates efficiently. |\n", + "| **Critique/Synth** | `o3` (Deep Review & Synthesis) | Provides rigorous, senior-level analysis, identifies risks, and ensures scientific validity. |\n", + "| **Safety (Opt.)** | `gpt-4.1-mini` (Targeted Check) | Adds an extra layer of specialized safety review before human handoff. |\n", + "| **Learning** | `o3` + Code Interpreter (Result Analysis → DB) | Captures experimental outcomes systematically, enabling continuous improvement over time. |\n", + "| **Core Technique** | Multi-Agent Collaboration & Escalation | Leverages strengths of different models (speed vs. depth) for a complex, multi-step reasoning task. |\n", + "\n", + "*Note: Model identifiers accurate as of April 2025, subject to change.*\n", + "\n", + "## 1. Scenario Snapshot\n", + "\n", + "* **Problem Space:** Optimizing complex experimental procedures in pharmaceutical R&D, such as improving the synthesis yield of a new drug compound (\"XYZ-13\") while adhering to strict constraints.\n", + "* **Users:** Research scientists and lab technicians involved in drug discovery and development.\n", + "* **Typical Asks:**\n", + " 1. Suggest 3 distinct protocols to increase XYZ-13 yield by ≥15% by testing different catalysts, staying under $15k using approved reagents.\n", + " 2. Propose protocols to optimize XYZ-13 yield below 60°C (due to past heat issues), exploring different approved solvents within budget.\n", + " 3. Design two XYZ-13 yield strategies (aiming for ≥15%): a. one maximizing potential yield within the \\$15k budget, b. one prioritizing cost under \\$10k.\n", + "* **Constraints:**\n", + " * **Budgetary:** Operate within defined financial limits (e.g., $15,000 per experiment series).\n", + " * **Regulatory/Safety:** Use only pre-approved chemicals/reagents and adhere rigorously to safety protocols.\n", + " * **Human Oversight:** Final experimental plans must be reviewed and validated by a human expert before execution.\n", + "\n", + "> Traditionally, optimizing such experiments involves weeks of manual planning, literature review, iterative benchwork, and analysis. 
This AI Co-Scientist approach aims to dramatically reduce the cycle time by automating hypothesis generation, protocol design, and preliminary evaluation, enabling scientists to focus on higher-level strategy and final validation. It shifts the scientist's role from manual execution of planning steps to expert oversight and collaboration with the AI.\n", + "\n", + "\n", + "## 2. Architecture (Multi-Agent Reasoning)\n", + "\n", + "The system employs a multi-agent architecture that emulates a high-performing scientific team. Different AI components, acting in specialized roles (such as ideation, critique, and learning from outcomes), collaborate using various models and tools to execute the workflow.\n", + "\n", + "![AI Co-Scientist Architecture](images/3B_coscientist_architecture.png)\n", + "\n", + "### 2.1. **Scientist Input & Constraints:** \n", + "The process starts with the scientist defining the goal, target compound, and constraints." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "abbeddb3", + "metadata": {}, + "outputs": [], + "source": [ + "from openai import OpenAI\n", + "from agent_utils import Context, call_openai, log_json\n", + "\n", + "# Example Initial Input\n", + "user_input = {\n", + " \"compound\": \"XYZ-13\",\n", + " \"goal\": \"Improve synthesis yield by 15%\",\n", + " \"budget\": 15000,\n", + " \"time_h\": 48,\n", + " \"previous\": \"Prior attempts failed at high temp; explore potential catalyst effects.\"\n", + "}\n", + "ctx = Context(client=OpenAI(), **user_input)" + ] + }, + { + "cell_type": "markdown", + "id": "e791f29f", + "metadata": {}, + "source": [ + "### 2.2. **Ideation (`o4-mini` + Tools):** \n", + "Multiple `o4-mini` instances, prompted with different roles (e.g., `Hypothesis Agent`, `Protocol Agent`, `Resource Agent`), generate experimental plans in parallel. Assigning distinct personas encourages diverse perspectives and covers different aspects of the problem simultaneously during the ideation phase." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "3f06fe8c", + "metadata": {}, + "outputs": [], + "source": [ + "ROLE_FOCUS = {\n", + " # Hypothesis Agent Prompt\n", + " \"hypothesis_agent\": \"\"\"You are a pharmaceutical hypothesis specialist. \n", + " Focus exclusively on analyzing the compound structure and research goals to generate testable hypotheses. \n", + " Consider mechanism of action, binding affinity predictions, and potential off-target effects.\"\"\",\n", + "\n", + " # Protocol Agent Prompt\n", + " \"protocol_agent\" : \"\"\"You are a laboratory protocol specialist. \n", + " Design experimental procedures that will effectively test the provided hypothesis. \n", + " Focus on experimental conditions, controls, and measurement techniques.\"\"\",\n", + "\n", + " # Resource Agent Prompt\n", + " \"resource_agent\" : \"\"\"You are a laboratory resource optimization specialist. \n", + " Review the proposed protocol and optimize for efficiency. \n", + " Identify opportunities to reduce reagent use, equipment time, and overall costs while maintaining scientific validity.\"\"\",\n", + "}\n", + "\n", + "# Create a structured prompt template for ideation\n", + "IDEATION_PROMPT = \"\"\"You are a pharmaceutical {role} specialist. 
Your goal is to {goal} for compound {compound}.\n", + "Constraints:\n", + "- Budget: ${budget}\n", + "- Approved reagents only\n", + "- Complete within {time_h} hours\n", + "- Previous attempts: {previous}\n", + "Respond with structured JSON describing your protocol.\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "fcf9f5ef", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Run‑id fb282d6f Compound: XYZ-13\n", + "Logs will be stored in: logs/fb282d6f\n" + ] + } + ], + "source": [ + "import json, logging\n", + "from pathlib import Path\n", + "from typing import Dict, List, Any, Optional\n", + "from dataclasses import asdict\n", + "from functools import partial\n", + "\n", + "MODEL_IDEATE = \"o4-mini-2025-04-16\" # o4-mini model for ideation - balances speed and quality\n", + "\n", + "# Configure logging to help with tracking experiment progress and debugging\n", + "logging.basicConfig(level=logging.INFO, format=\"%(message)s\")\n", + "logging.info(f\"Run‑id {ctx.run_id} Compound: {ctx.compound}\")\n", + "logging.info(f\"Logs will be stored in: {Path('logs') / ctx.run_id}\")\n", + "\n", + "def ideation(ctx: Context):\n", + " logging.info(\"Starting ideation phase...\")\n", + " ideas = []\n", + " for role, focus in ROLE_FOCUS.items():\n", + " logging.info(f\"Running ideation agent ${role}\")\n", + " sys = IDEATION_PROMPT.format(role=role, focus=focus, **ctx.prompt_vars())\n", + " usr = f\"Design a protocol to {ctx.goal} within ${ctx.budget}.\"\n", + " idea = call_openai(ctx.client, MODEL_IDEATE, sys, usr, ctx)\n", + " ideas.append(idea)\n", + " log_json(\"ideation_done\", ideas, ctx)\n", + " return ideas" + ] + }, + { + "cell_type": "markdown", + "id": "0384e0d5", + "metadata": {}, + "source": [ + "The ideation agents can utilize external tools such as `literature_search`, `chem_lookup` (chemical database), `cost_estimator`, `outcome_db` (outcome of previous experiments) to ground their suggestions in data. Explicitly enabling and prompting models to use external tools ensures that generated plans are feasible, compliant, and informed by existing knowledge. The model decides when and which tool to call based on the task." 
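+ ,
+ "\n",
+ "\n",
+ "The snippet below is a minimal, illustrative sketch of such a function-calling round trip (it is not the notebook's `call_openai` helper): the model is offered the tool manifest, each requested call is dispatched to a local implementation, and the result is appended back as a `tool` message until the model produces a final answer. `run_tool_loop` and `tool_impls` are names invented for this sketch.\n",
+ "\n",
+ "```python\n",
+ "import json\n",
+ "from openai import OpenAI\n",
+ "\n",
+ "def run_tool_loop(client: OpenAI, messages: list, tools: list, tool_impls: dict, model: str) -> str:\n",
+ "    # Keep calling the model until it stops requesting tools.\n",
+ "    while True:\n",
+ "        resp = client.chat.completions.create(model=model, messages=messages, tools=tools)\n",
+ "        msg = resp.choices[0].message\n",
+ "        if not msg.tool_calls:\n",
+ "            return msg.content\n",
+ "        messages.append(msg)  # keep the assistant turn that requested the tools\n",
+ "        for call in msg.tool_calls:\n",
+ "            args = json.loads(call.function.arguments or \"{}\")\n",
+ "            result = tool_impls[call.function.name](**args)  # e.g. {\"chem_lookup\": chem_lookup, ...}\n",
+ "            messages.append({\"role\": \"tool\", \"tool_call_id\": call.id, \"content\": json.dumps(result)})\n",
+ "```"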
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "a8f365d8", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Starting ideation phase...\n", + "Running ideation agent $hypothesis_agent\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "(Tool) List available chemicals\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "(Tool) Outcome DB: XYZ-13, yield, 5\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "(Tool) Cost estimator: [{'name': 'Palladium acetate', 'amount': 0.2, 'unit': 'g'}, {'name': 'Triphenylphosphine', 'amount': 0.63, 'unit': 'g'}, {'name': 'Triethylamine', 'amount': 2.8, 'unit': 'mL'}, {'name': 'Dimethylformamide', 'amount': 10, 'unit': 'mL'}, {'name': 'Toluene', 'amount': 30, 'unit': 'mL'}, {'name': 'Methanol', 'amount': 20, 'unit': 'mL'}, {'name': 'Silica gel', 'amount': 50, 'unit': 'g'}], ['heating mantle', 'round-bottom flask', 'inert gas line', 'column chromatography setup'], 48\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "Running ideation agent $protocol_agent\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "(Tool) List available chemicals\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "(Tool) Outcome DB: XYZ-13, None, 5\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "(Tool) Chemical lookup: Triphenylphosphine, None\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "(Tool) Chemical lookup: Triethylamine, None\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "(Tool) Cost estimator: [{'name': 'Palladium chloride', 'amount': 0.05, 'unit': 'g'}, {'name': 'Triphenylphosphine', 'amount': 0.5, 'unit': 'g'}, {'name': 'Potassium carbonate', 'amount': 1, 'unit': 'g'}, {'name': 'Dimethylformamide', 'amount': 10, 'unit': 'mL'}], ['reflux setup', 'inert gas line'], 24\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "(Tool) Chemical lookup: Sodium borohydride, None\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "(Tool) Cost estimator: [{'name': 'Palladium chloride', 'amount': 0.05, 'unit': 'g'}, {'name': 'Triphenylphosphine', 'amount': 0.5, 'unit': 'g'}, {'name': 'Potassium carbonate', 'amount': 1, 'unit': 'g'}, {'name': 'Dimethylformamide', 'amount': 10, 'unit': 'mL'}, {'name': 'Sodium borohydride', 'amount': 0.1, 'unit': 'g'}, {'name': 'Triethylamine', 'amount': 5, 'unit': 'mL'}], ['reflux setup', 'inert gas line'], 30\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "Running ideation agent $resource_agent\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "(Tool) List available chemicals\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "(Tool) Outcome DB: XYZ-13, None, 5\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "(Tool) Cost estimator: [{'name': 'Palladium acetate', 'amount': 0.5, 'unit': 'g'}, {'name': 'Triphenylphosphine', 'amount': 1, 'unit': 'g'}, {'name': 'Potassium carbonate', 'amount': 5, 'unit': 'g'}, {'name': 
'Dimethylformamide', 'amount': 50, 'unit': 'mL'}], ['Magnetic stirrer', 'Oil bath', 'Reflux condenser'], 36\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "Ideation complete!\n" + ] + } + ], + "source": [ + "IDEATION_PROMPT += \"\"\"\\nUse the following tools as appropriate:\n", + "- Use the `list_available_chemicals` tool to get list of approved reagents.\n", + "- Use the `chem_lookup` tool to verify properties of reagents mentioned.\n", + "- Use the `cost_estimator` tool to calculate the approximate cost based on reagents and proposed steps.\n", + "- Check the `outcome_db` for relevant prior experiments with {compound}\"\"\"\n", + "\n", + "ideas = ideation(ctx)\n", + "logging.info(\"Ideation complete!\")" + ] + }, + { + "cell_type": "markdown", + "id": "6f507348", + "metadata": {}, + "source": [ + "These tools are defined in `agent_utils.py`. For purposes of this solution, the tool calls are mocked in `tools.py`. In a real use case, these tools would call real APIs.\n", + "\n", + "\n", + "### 2.3. **Tournament Ranking (`o4-mini` / `o3`):** \n", + "Generated protocols are compared pairwise based on criteria like expected effectiveness, feasibility, cost, and novelty. Instead of asking a model to score protocols in isolation, providing two protocols at a time and asking for a direct comparison against specific criteria often yields more reliable relative rankings.\n", + "\n", + "This Elo-style ranking identifies the most promising candidates for deeper review." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "f85fe4b7", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Starting tournament phase...\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "Tournament winner picked!\n" + ] + } + ], + "source": [ + "TOURNAMENT_PROMPT = \"\"\"\n", + "Protocol A: [details...]\n", + "Protocol B: [details...]\n", + "\n", + "Compare Protocol A and Protocol B for synthesizing {compound} aimed at {goal}. Score them on:\n", + "1. Likelihood of achieving ≥ 15% yield increase.\n", + "2. Practical feasibility (reagents, time).\n", + "3. Estimated cost-efficiency (use tool if needed).\n", + "4. Scientific novelty/risk.\n", + "\n", + "Return JSON {{\\\"winner\\\": \\\"A\\\"|\\\"B\\\", \\\"justification\\\": \\\"...\\\"}}.\"\"\"\n", + "\n", + "# This is a mock tourname implementation that only compares the first two protocols\n", + "# A real implementation would compare pairs in a tournament bracket style\n", + "def tournament(protocols: List[Dict[str, Any]], ctx: Context):\n", + " logging.info(\"Starting tournament phase...\")\n", + " if len(protocols) == 1:\n", + " return protocols[:1]\n", + " a, b = protocols[0], protocols[1]\n", + " sys = TOURNAMENT_PROMPT.format(**ctx.prompt_vars())\n", + " usr = json.dumps({\"A\": a, \"B\": b}, indent=2)\n", + " res = call_openai(ctx.client, MODEL_IDEATE, sys, usr, ctx)\n", + " winner = a if res.get(\"winner\", \"A\").upper() == \"A\" else b\n", + " log_json(\"tournament\", res, ctx)\n", + " return [winner]\n", + "\n", + "top_proto = tournament(ideas, ctx)[0]\n", + "logging.info(\"Tournament winner picked!\")" + ] + }, + { + "cell_type": "markdown", + "id": "41ad4731", + "metadata": {}, + "source": [ + "> In early experiments, we found that asking models to score protocols on a 1-10 scale led to inconsistent results with score compression. 
The tournament approach solved this by forcing relative judgments that proved more reliable. This mirrors human expert behavior — scientists often find it easier to compare two options directly than to assign absolute scores.\n", + "\n", + "### 2.4. **Deep Critique & Synthesis (`o3`):** \n", + "The top-ranked protocols are passed to `o3` for rigorous review. `o3` acts like a senior scientist, assessing scientific validity, methodology, safety, budget compliance, and suggesting improvements or synthesizing a final, refined protocol. It may also call tools for verification." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "634ef4e2", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Starting critique phase...\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "(Tool) Cost estimator: [{'name': 'Palladium chloride (PdCl2)', 'amount': 0.05, 'unit': 'g'}, {'name': 'Triphenylphosphine (PPh3)', 'amount': 0.5, 'unit': 'g'}, {'name': 'Sodium borohydride (NaBH4)', 'amount': 0.1, 'unit': 'g'}, {'name': 'Potassium carbonate (K2CO3)', 'amount': 1, 'unit': 'g'}, {'name': 'Dimethylformamide (DMF)', 'amount': 10, 'unit': 'mL'}, {'name': 'Toluene', 'amount': 10, 'unit': 'mL'}, {'name': 'Ethyl acetate', 'amount': 60, 'unit': 'mL'}, {'name': 'Magnesium sulfate', 'amount': 5, 'unit': 'g'}, {'name': 'Silica gel', 'amount': 50, 'unit': 'g'}, {'name': 'Hexanes', 'amount': 500, 'unit': 'mL'}, {'name': 'Ethyl acetate (chromatography)', 'amount': 500, 'unit': 'mL'}], ['Reflux apparatus', 'Schlenk line', 'Rotary evaporator', 'Flash column'], 32\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "(Tool) Outcome DB: XYZ-13, None, 5\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "(Tool) Cost estimator: [{'name': 'Palladium chloride (PdCl2)', 'amount': 0.05, 'unit': 'g'}, {'name': 'Triphenylphosphine (PPh3)', 'amount': 0.2, 'unit': 'g'}, {'name': 'Sodium borohydride (NaBH4)', 'amount': 0.08, 'unit': 'g'}, {'name': 'Potassium carbonate (K2CO3)', 'amount': 1.2, 'unit': 'g'}, {'name': 'Dimethylformamide (DMF)', 'amount': 7.5, 'unit': 'mL'}, {'name': 'Toluene', 'amount': 17.5, 'unit': 'mL'}, {'name': 'Ethyl acetate', 'amount': 100, 'unit': 'mL'}, {'name': 'Hexanes', 'amount': 100, 'unit': 'mL'}, {'name': 'Magnesium sulfate', 'amount': 5, 'unit': 'g'}], ['Reflux apparatus', 'Schlenk line', 'Rotary evaporator'], 24\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "Deep critique completed!\n" + ] + } + ], + "source": [ + "# Deep critique phase using a more powerful model for rigorous review\n", + "CRITIQUE_PROMPT = \"\"\"You are a senior researcher reviewing a proposed synthesis protocol \n", + "for {compound} aiming for {goal}, budget ${budget} using approved reagents. Review the protocol below rigorously:\n", + "1. Identify scientific flaws or methodological weaknesses.\n", + "2. Assess safety risks and budget compliance (use `cost_estimator` tool if needed).\n", + "3. Check for consistency with prior `outcome_db` results if relevant.\n", + "4. Suggest concrete improvements or rewrite sections if necessary.\n", + "5. 
Provide a final go/no-go recommendation.\n", + "\n", + "Return JSON {{\\\"revised_protocol\\\": ..., \\\"critique\\\": \\\"...\\\", \\\"recommendation\\\": \\\"go|no-go\\\"}}.\n", + "\n", + "Protocol to Review:\n", + "[Protocol details...]\n", + "\"\"\"\n", + "\n", + "MODEL_CRITIQUE = \"o3-2025-04-16\" # o3 model for deep critique\n", + "\n", + "def critique(protocol: Dict[str, Any], ctx: Context):\n", + " logging.info(\"Starting critique phase...\")\n", + " sys = CRITIQUE_PROMPT.format(**ctx.prompt_vars())\n", + " usr = json.dumps(protocol, indent=2)\n", + " crit = call_openai(ctx.client, MODEL_CRITIQUE, sys, usr, ctx)\n", + " log_json(\"critique\", crit, ctx)\n", + " return crit.get(\"revised_protocol\", protocol)\n", + "\n", + "critiqued = critique(top_proto, ctx)\n", + "logging.info(\"Deep critique completed!\")" + ] + }, + { + "cell_type": "markdown", + "id": "1fbd87a7", + "metadata": {}, + "source": [ + "> We deliberately separate ideation from critique using different models and personas. Having the same model both generate and critique its own work often leads to self-justification rather than objective assessment. The o3 model, acting as a \"senior scientist,\" consistently identified methodological weaknesses that o4-mini missed during ideation.\n", + "\n", + "### 2.5. **(Optional) Safety Check:** \n", + "A specialized model, such as `gpt-4.1-mini`, can perform a final check for specific safety concerns (e.g., hazardous reagent combos)." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "cc4405e4", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Starting safety assessment...\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "(Tool) Chemical lookup: Palladium chloride, None\n", + "(Tool) Chemical lookup: Triphenylphosphine, None\n", + "(Tool) Chemical lookup: Sodium borohydride, None\n", + "(Tool) Chemical lookup: Potassium carbonate, None\n", + "(Tool) Chemical lookup: Dimethylformamide, None\n", + "(Tool) Chemical lookup: Toluene, None\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "Safety check completed!\n" + ] + } + ], + "source": [ + "# Optional safety check using a targeted model\n", + "SAFETY_PROMPT = \"\"\"You are a lab‑safety specialist. \n", + "Identify hazards, unsafe conditions, or compliance issues in this protocol for {compound}. \n", + "Use `chem_lookup` tool if needed. Return JSON assessment.\"\"\"\n", + "\n", + "MODEL_SAFETY = \"gpt-4.1-mini-2025-04-14\" # gpt-4.1-mini model for safety checks - optimized for instruction following\n", + "\n", + "def safety(protocol: Dict[str, Any], ctx: Context):\n", + " logging.info(\"Starting safety assessment...\")\n", + " sys = SAFETY_PROMPT.format(**ctx.prompt_vars())\n", + " usr = json.dumps(protocol, indent=2)\n", + " assessment = call_openai(ctx.client, MODEL_SAFETY, sys, usr, ctx)\n", + " log_json(\"safety\", assessment, ctx)\n", + " return {\"protocol\": protocol, \"safety\": assessment}\n", + "\n", + "secured = safety(critiqued, ctx)\n", + "logging.info(\"Safety check completed!\")" + ] + }, + { + "cell_type": "markdown", + "id": "9dd93396", + "metadata": {}, + "source": [ + "### 2.6. **Human Review:** \n", + "The AI-generated final plan is presented to the human scientist via an interface for validation, potential edits, and final approval." 
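The next cell implements this gate as a simple approve/reject prompt in the terminal. To also cover the "potential edits" mentioned above, one illustrative extension (the edit branch, its prompt text, and the decision to treat an edited submission as approved are hypothetical, not part of the notebook) might look like:

```python
import json
from typing import Any, Dict


def human_review_with_edits(protocol: Dict[str, Any]) -> Dict[str, Any]:
    """Approve, reject, or replace the protocol with a reviewer-edited JSON version."""
    print(json.dumps(protocol, indent=2))
    choice = input("Approve (y), reject (n), or edit (e)? ").strip().lower()
    if choice == "e":
        edited_text = input("Paste the revised protocol as JSON: ")
        try:
            protocol = json.loads(edited_text)   # reviewer-supplied revision replaces the draft
        except json.JSONDecodeError:
            print("Invalid JSON; keeping the original protocol.")
    # Here an edit is treated as approval of the revised protocol (an illustrative choice).
    return {"protocol": protocol, "approved": choice in ("y", "e")}
```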
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "e2d47339", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Awaiting human review...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== PROTOCOL FOR REVIEW: XYZ-13 - Improve synthesis yield by 15% ===\n", + "DETAILS: {\n", + " \"raw\": \"```json\\n{\\n \\\"objective\\\": \\\"Improve the batch\\u2010scale yield of XYZ-13 by \\u226515% (from ~62% to \\u226572%) within 48 h and under a total budget of $15 000\\\",\\n \\\"target_yield_percent\\\": 72,\\n \\\"budget\\\": 15000,\\n \\\"timeline\\\": \\\"48 hours\\\",\\n \\\"overview\\\": {\\n \\\"rationale\\\": [\\n \\\"Previous data show Pd\\u2010salt catalysis gives 58\\u201362% yield at 70\\u201385 \\u00b0C, but higher T leads to side products.\\\",\\n \\\"We propose in situ generation of a Pd(0)\\u2013Triphenylphosphine complex (improved activity) from PdCl\\u2082 + PPh\\u2083 + NaBH\\u2084.\\\",\\n \\\"Use a mixed solvent system (DMF/Toluene) and mild base to balance solubility and minimize side reactions.\\\"\\n ]\\n },\\n \\\"materials\\\": {\\n \\\"reagents\\\": [\\n { \\\"name\\\": \\\"Palladium chloride (PdCl\\u2082)\\\", \\\"amt\\\": 0.05, \\\"unit\\\": \\\"g\\\" },\\n { \\\"name\\\": \\\"Triphenylphosphine (PPh\\u2083)\\\", \\\"amt\\\": 0.50, \\\"unit\\\": \\\"g\\\" },\\n { \\\"name\\\": \\\"Sodium borohydride (NaBH\\u2084)\\\", \\\"amt\\\": 0.10, \\\"unit\\\": \\\"g\\\" },\\n { \\\"name\\\": \\\"Potassium carbonate (K\\u2082CO\\u2083)\\\", \\\"amt\\\": 1.00, \\\"unit\\\": \\\"g\\\" },\\n { \\\"name\\\": \\\"Dimethylformamide (DMF)\\\", \\\"amt\\\": 10.0, \\\"unit\\\": \\\"mL\\\" },\\n { \\\"name\\\": \\\"Toluene\\\", \\\"amt\\\": 10.0, \\\"unit\\\": \\\"mL\\\" }\\n ],\\n \\\"equipment\\\": [\\n \\\"Reflux apparatus with inert\\u2010gas inlet\\\",\\n \\\"Magnetic stirrer and hotplate\\\",\\n \\\"Schlenk line or glovebox for N\\u2082 purging\\\",\\n \\\"Filtration setup\\\",\\n \\\"Rotary evaporator\\\",\\n \\\"Analytical HPLC/GC or NMR for yield determination\\\"\\n ]\\n },\\n \\\"procedure\\\": [\\n {\\n \\\"step\\\": 1,\\n \\\"description\\\": \\\"In a dry 50 mL round\\u2010bottom flask under N\\u2082, dissolve PdCl\\u2082 (0.05 g) and PPh\\u2083 (0.50 g) in 5 mL DMF. Stir 10 min at room temperature to form Pd\\u2013PPh\\u2083 complex.\\\"\\n },\\n {\\n \\\"step\\\": 2,\\n \\\"description\\\": \\\"Add NaBH\\u2084 (0.10 g) portionwise over 5 min; gas evolution indicates Pd(0) generation. Stir additional 10 min under N\\u2082.\\\"\\n },\\n {\\n \\\"step\\\": 3,\\n \\\"description\\\": \\\"Add K\\u2082CO\\u2083 (1.00 g), then charge with a 1:1 DMF/Toluene mixture (total 20 mL). Purge the headspace with N\\u2082 for 5 min.\\\"\\n },\\n {\\n \\\"step\\\": 4,\\n \\\"description\\\": \\\"Charge in your substrate XYZ-13 (stoichiometry as per standard scale). Heat to 70 \\u00b0C and stir for 24 h under N\\u2082.\\\"\\n },\\n {\\n \\\"step\\\": 5,\\n \\\"description\\\": \\\"Allow to cool to room temperature, quench with water (20 mL), extract with ethyl acetate (3\\u00d720 mL), dry organic layers over MgSO\\u2084, filter and concentrate.\\\"\\n },\\n {\\n \\\"step\\\": 6,\\n \\\"description\\\": \\\"Purify by flash chromatography (silica gel, gradient hexanes/EtOAc). 
Collect and weigh product.\\\"\\n }\\n ],\\n \\\"analysis_and_monitoring\\\": [\\n \\\"Monitor reaction aliquots by HPLC or GC every 6 h to track conversion and by\\u2010product formation.\\\",\\n \\\"Characterize isolated product by ^1H NMR and mass spectrometry.\\\",\\n \\\"Compare isolated yield against control reaction (no NaBH\\u2084) to quantify improvement.\\\"\\n ],\\n \\\"anticipated_results\\\": {\\n \\\"expected_yield_percent\\\": \\\"\\u226572%\\\",\\n \\\"byproducts\\\": \\\"Minimal side products based on milder T and ligand stabilization of Pd(0).\\\"\\n },\\n \\\"safety_and_waste\\\": [\\n \\\"All operations under inert atmosphere; handle NaBH\\u2084 carefully (water\\u2010reactive).\\\",\\n \\\"DMF and toluene are toxic/flammable\\u2014use in fume hood.\\\",\\n \\\"Collect filtrates and washes for solvent recycle or disposal per institutional protocols.\\\"\\n ],\\n \\\"cost_estimate_usd\\\": 2421.28,\\n \\\"notes\\\": [\\n \\\"Total reagent+equipment+labor cost ~\\\\$2.4 k, well under budget for multiple parallel runs or scale\\u2010up.\\\",\\n \\\"Procedure can be completed within 48 h including analysis.\\\"\\n ]\\n}\\n```\"\n", + "}\n", + "SAFETY: {\n", + " \"hazards_and_safety_issues\": [\n", + " {\n", + " \"chemical\": \"Sodium borohydride\",\n", + " \"issue\": \"Water-reactive causing gas evolution, flammable\",\n", + " \"protocol_note\": \"Handle carefully under inert atmosphere; ensure controlled addition to avoid rapid gas release and pressure buildup.\"\n", + " },\n", + " {\n", + " \"chemical\": \"Dimethylformamide\",\n", + " \"issue\": \"Reproductive toxin and flammable solvent\",\n", + " \"protocol_note\": \"Must be used in a well-ventilated fume hood; avoid skin contact and inhalation.\"\n", + " },\n", + " {\n", + " \"chemical\": \"Toluene\",\n", + " \"issue\": \"Flammable and central nervous system depressant\",\n", + " \"protocol_note\": \"Use in fume hood with proper grounding and avoid ignition sources.\"\n", + " },\n", + " {\n", + " \"chemical\": \"Palladium chloride\",\n", + " \"issue\": \"Irritant and potential carcinogen\",\n", + " \"protocol_note\": \"Use personal protective equipment (PPE) including gloves and lab coat; avoid inhalation and skin contact.\"\n", + " },\n", + " {\n", + " \"chemical\": \"Triphenylphosphine\",\n", + " \"issue\": \"Irritant\",\n", + " \"protocol_note\": \"Use proper PPE; avoid inhalation and contact.\"\n", + " },\n", + " {\n", + " \"chemical\": \"Potassium carbonate\",\n", + " \"issue\": \"Irritant\",\n", + " \"protocol_note\": \"Handle with gloves and eye protection.\"\n", + " }\n", + " ],\n", + " \"unsafe_conditions\": [\n", + " {\n", + " \"condition\": \"Gas evolution during addition of sodium borohydride\",\n", + " \"recommendation\": \"Conduct addition slowly with proper venting to prevent pressure buildup and risk of splashing or inhalation.\"\n", + " },\n", + " {\n", + " \"condition\": \"Use of flammable solvents (DMF, toluene) at elevated temperature (70\\u00b0C)\",\n", + " \"recommendation\": \"Ensure apparatus is properly grounded and flame sources are controlled; perform procedure in fume hood to avoid accumulation of vapors.\"\n", + " }\n", + " ],\n", + " \"compliance_issues\": [\n", + " {\n", + " \"issue\": \"Waste handling\",\n", + " \"recommendation\": \"Filtrates and solvent washes contain toxic and flammable chemicals; must be collected and disposed following institutional hazardous waste protocols.\"\n", + " },\n", + " {\n", + " \"issue\": \"Inert atmosphere handling\",\n", + " \"recommendation\": 
\"Ensure Schlenk line or glovebox is properly maintained and checked for nitrogen purity to prevent oxidation or hazardous side reactions.\"\n", + " }\n", + " ],\n", + " \"general_recommendations\": [\n", + " \"Confirm proper training and availability of PPE to all personnel.\",\n", + " \"Have emergency response measures ready in case of fire or chemical exposure.\",\n", + " \"Monitor reaction aliquots carefully to detect unexpected side reactions early.\"\n", + " ]\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Protocol approved\n" + ] + } + ], + "source": [ + "def human_review(safety_package: Dict[str, Any], ctx: Context):\n", + " logging.info(\"Awaiting human review...\")\n", + " protocol = safety_package[\"protocol\"]\n", + " safety_assessment = safety_package[\"safety\"]\n", + " \n", + " print(f\"\\n=== PROTOCOL FOR REVIEW: {ctx.compound} - {ctx.goal} ===\")\n", + " print(f\"DETAILS: {json.dumps(protocol, indent=2)}\")\n", + " print(f\"SAFETY: {json.dumps(safety_assessment, indent=2)}\")\n", + " \n", + " while True:\n", + " approval = input(\"\\nApprove for execution? (yes/no): \").lower()\n", + " if approval in ['yes', 'y', 'no', 'n']:\n", + " approved = approval in ['yes', 'y']\n", + " logging.info(f\"Protocol {'approved' if approved else 'rejected'}\")\n", + " return {\"protocol\": protocol, \"approved\": approved}\n", + " print(\"Please enter 'yes' or 'no'\")\n", + "\n", + "human_decision = human_review(secured, ctx)" + ] + }, + { + "cell_type": "markdown", + "id": "e51e598b", + "metadata": {}, + "source": [ + "### 2.7. **Execution & Learning (`o3` + Code Interpreter):** \n", + "Once the human approves, the plan is sent for lab execution. After lab execution, results are fed back into the system. `o3` combined with the `Code Interpreter` analyzes the data, generates insights, and stores structured outcomes (protocol, parameters, results, insights) in a database (`Outcome DB`). This database informs future ideation cycles, creating a learning loop." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "3894d1b3", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Starting mock execution and analysis...\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "Analysis complete\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "🎉 Completed. Summary written to output/fb282d6f_summary.json\n" + ] + } + ], + "source": [ + "# Simulating execution and analyzing results\n", + "ANALYSIS_PROMPT = \"\"\"You are a data analyst. \n", + "Did the experiment achieve {goal}? 
Analyse factors, suggest improvements, and return structured JSON.\n", + "\"\"\"\n", + "\n", + "def execute_and_analyse(pkt: Dict[str, Any], ctx: Context):\n", + " logging.info(\"Starting mock execution and analysis...\")\n", + " # These are mock results for a lab experiment\n", + " mock_results = {\n", + " \"yield_improvement\": 12.5,\n", + " \"success\": False,\n", + " \"actual_cost\": ctx.budget * 0.85,\n", + " \"notes\": \"Mock execution\"\n", + " }\n", + " sys = ANALYSIS_PROMPT.format(**ctx.prompt_vars())\n", + " usr = json.dumps({\"protocol\": pkt, \"results\": mock_results}, indent=2)\n", + " analysis = call_openai(ctx.client, MODEL_CRITIQUE, sys, usr, ctx)\n", + " log_json(\"analysis\", analysis, ctx)\n", + " return analysis\n", + "\n", + "# Only proceed to execution if approved by the human reviewer\n", + "if human_decision[\"approved\"]:\n", + " summary = execute_and_analyse(human_decision, ctx)\n", + " logging.info(\"Analysis complete\")\n", + "else:\n", + " logging.info(\"Protocol rejected by human reviewer - execution skipped\")\n", + " summary = None\n", + "\n", + "Path(\"output\").mkdir(exist_ok=True)\n", + "out_path = Path(\"output\") / f\"{ctx.run_id}_summary.json\"\n", + "out_path.write_text(json.dumps(summary, indent=2))\n", + "print(f\"\\n🎉 Completed. Summary written to {out_path}\")" + ] + }, + { + "cell_type": "markdown", + "id": "2f4ecb9f", + "metadata": {}, + "source": [ + "## 3. Model Playbook\n", + "\n", + "Choosing between `o4-mini` and `o3` depends on the task's complexity and required depth. For other tasks, `gpt-4.1-mini` provides balance between cost and performance, with the more powerful `gpt4.1` recommended when greater capability or nuance is needed.\n", + "\n", + "| Task | Start With | Upgrade When... | Escalate To | Rationale |\n", + "| :----------------- | :------------- | :--------------------------------------------------------- | :----------- | :------------------------------------------------------------------------------------------- |\n", + "| Ideation & Protocol Generation | `o4-mini` | Hypotheses lack depth or creativity needed for complex chemical synthesis. | `o3` | `o4-mini` rapidly generates diverse protocols cost-effectively. `o3` provides deeper scientific reasoning when more nuanced approaches are required. |\n", + "| Protocol Ranking | `o4-mini` | Comparison requires deeper scientific assessment or multi-factor trade-offs. | `o3` | Tournament-style ranking with `o4-mini` efficiently identifies promising candidates. Escalate when subtle scientific validity needs evaluation. |\n", + "| Deep Critique & Synthesis | `o3` | N/A - Already using the most capable model for this critical task. | N/A | `o3` excels at rigorous scientific review, identifying methodological flaws, and synthesizing improvements across complex protocols. This task inherently requires deep reasoning. |\n", + "| Safety Assessment | `gpt-4.1-mini` | Domain-specific hazards require higher accuracy or specialized knowledge. | `gpt-4.1` | `gpt-4.1-mini` offers a good balance of cost and performance for standard safety checks. Escalate to `gpt4.1` when higher accuracy or more nuanced reasoning is needed for complex safety risks. |\n", + "\n", + "**Key Insight:**\n", + "> This use case exemplifies a powerful pattern: using faster, cheaper models (`o4-mini`) for breadth and initial filtering, then escalating to more powerful models (`o3`) for depth, critical review, and synthesis. 
This layered approach optimizes for both creativity/speed and rigor/accuracy, while managing computational costs effectively. The integration with tools is essential for grounding the AI's reasoning in verifiable, real-world data.\n", + "\n", + "## 4. Deployment Notes\n", + "\n", + "Transitioning the AI Co-Scientist from prototype to lab use involves careful planning.\n", + "\n", + "* **Cost Control:**\n", + " * Implement configurable \"modes\" (such as `Fast`, `Standard`, `Thorough`) that adjust the number of `o4-mini` ideation agents, the depth of `o3` critique, or the use of optional checks to balance result quality with cost and latency.\n", + " * Track token usage per stage (ideation, ranking, critique) and per tool call for fine-grained cost monitoring.\n", + "* **Observability:**\n", + " * Log inputs, outputs, model choices, tool calls/responses, latencies, and token counts for each step.\n", + " * Monitor the performance of the tournament ranking and the impact of `o3` critiques (such as how often plans are significantly altered or rejected).\n", + " * Track user interactions: which plans are approved, edited, or rejected by the human scientist.\n", + "* **Safety & Compliance:**\n", + " * Implement multiple safety layers: constraints in prompts, tool-based checks (such as reagent compatibility via `chem_lookup`), optional dedicated model checks (`gpt-4.1-mini`), automated filters (such as for known hazardous combinations), and mandatory human review.\n", + " * Ensure tool endpoints (such as internal databases) meet security requirements.\n", + "* **Rollout Strategy:** \n", + " * Begin with retrospective analysis of past experiments, then move to shadow mode (AI suggests plans alongside human planners), followed by limited live use cases with close monitoring before broader adoption.\n", + "\n", + "\n", + "## 5. Takeaways\n", + "\n", + "1. **Model pairing creates synergy**: `o4-mini` covers more ground quickly; `o3` brings precision and depth.\n", + "2. **Tool integration grounds reasoning in reality**: Real-world data such as chemical costs and safety constraints inform decision-making.\n", + "3. **Human scientists remain central**: The system empowers experts by removing grunt work—not by replacing them.\n", + "\n", + "\n", + "## 6. Useful Cookbooks & Resources\n", + "\n", + "Here are select resources that complement the design and implementation of the AI Co-Scientist system:\n", + "\n", + "- **[Orchestrating Agents: Routines and Handoffs](https://cookbook.openai.com/examples/orchestrating_agents)** Structuring multi-agent workflows with routines and handoffs, relevant to the ideation→ranking→critique pipeline.\n", + "\n", + "- **[GPT-4.1 Prompting Guide](https://cookbook.openai.com/examples/gpt4-1_prompting_guide)** Advanced prompting, tool use, and task decomposition for improved accuracy in critique and safety reviews.\n", + "\n", + "- **[Structured Outputs for Multi-Agent Systems](https://cookbook.openai.com/examples/structured_outputs_multi_agent)** Enforcing consistent JSON outputs with schema validation for agent interoperability.\n", + "\n", + "- **[Agents - OpenAI API](https://platform.openai.com/docs/guides/agents)** \n", + " Comprehensive guide to building multi-agent systems with OpenAI tools, covering orchestration, tool use, and best practices foundational to this system's architecture.\n", + "\n", + "================================================================================\n", + "\n", + "\n", + "\n", + "## 3C. 
Use Case: Insurance Claim Processing\n", + "\n", + "![](./images/3C_insurance_task_card.png)\n", + "\n", + "Many businesses are faced with the task of digitizing hand filled forms. In this section, we will demonstrate how OpenAI can be used to digitize and validate a hand filled insurance form. While this is a common problem for insurance, the same techniques can be applied to a variety of other industries and forms, for example tax forms, invoices, and more.\n", + "\n", + "## 🗂️ TL;DR Matrix\n", + "\n", + "This table summarizes the core technology choices and their rationale for this specific OCR implementation targeting the insurance use case.\n", + "\n", + "| Layer | Choice | Utility |\n", + "| :---- | :---- | :---- |\n", + "| JSON Output | Structured output with Pydantic | Easy to specify formatting, adheres to schema better than `JSON mode` |\n", + "| OCR and Vision | `gpt-4.1` | Powerful OCR and vision capabilities, structured output |\n", + "| Reasoning | `o4-mini` | Affordable but capable reasoning, function calling available |\n", + "| Form Validation | Custom function calling | Can provide interaction with custom or internal databases |\n", + "\n", + "\\*Note: Prices and model identifiers accurate as of April 2025, subject to change.\n", + "\n", + "## 1\\. Scenario Snapshot\n", + "\n", + "* **Users:** The target users are insurance servicing and ops teams who need to ingest data from handwritten forms. \n", + "* **Typical Asks:** Each form will have a different required structure, as well as different fields that need to be extracted. \n", + "* **Constraints:** \n", + " * **Accuracy:** High accuracy is required to ensure that the data is correct and complete. \n", + " * **Uncertainty:** The system must handle uncertainty in the data, such as missing data, ambiguous data, and different formats of the same field. In the event that the model cannot resolve the uncertainty, the system requires a mechanism to request human review. \n", + " * **Performance & Cost:** While system latency is not critical, high accuracy is required while keeping costs under control. We will aim for a cost target of $20 or less per 1000 pages processed.\n", + "\n", + "## 2\\. Architecture\n", + "\n", + "The high level basic architecture of the solution is shown below.\n", + "\n", + "![](./images/3C_insurance_architecture.png)\n", + "\n", + "This task is complex and requires a wide variety of model capabilities, including vision, function calling, reasoning, and structured output. While `o3` is capable of doing all of these at once, we found during experimentation that `o4-mini` alone was not sufficient to achieve the necessary performance. Due to the higher relative costs of `o3`, we instead opted for a two-stage approach.\n", + "\n", + "1. Stage one is performed using the vision capabilities of GPT 4.1. This stage is optimized to extract text with maximum accuracy, leaving uncertainty for the reasoning stage and not making any assumptions not visible on the page. By doing OCR in the first stage, we do not require the reasoning model to work directly from an image, which can be challenging given all the other tasks the reasoning model must perform. \n", + " \n", + "2. Stage two takes advantage of the reasoning abilities of `o4-mini`. We use `o4-mini` to validate the accuracy of the OCR and to extract the data into a structured format. 
Importantly, we expect o4-mini to act as the secondary quality gate \\-- if the OCR is incomplete at this stage we can use o4-mini to refine and validate the original results.\n", + "\n", + "To demonstrate concretely how this works, let's look at a sample image of an insurance form.\n", + "\n", + "![](./images/3C_insurance_form.png)\n", + "\n", + "While the form itself is fairly straightforward, there is missing data and ambiguous information that will be difficult for a traditional OCR system to fill out correctly. First, notice that the zip code and county have been omitted. Second, the email address of the user is ambiguous \\-- it could be `jsmith1@gmail.com` or `jsmithl@gmail.com`. In the following sections, we will walk through how a well-designed solution can handle these ambiguities and return the correct form results.\n", + "\n", + "**Environment Setup & Library Code:**\n", + "\n", + "To make our example code more clear, we have broken out environment setup (such as `pip install` commands) and library functions into a separate code block. This will make it easier to focus on only the relevant logic in each step of our solution." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "923344db", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.1.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "# Install Python requirements\n", + "%pip install -qU pydantic \"openai>=1.76.0\"\n", + "\n", + "# All imports\n", + "import os\n", + "import json\n", + "\n", + "from pydantic import BaseModel\n", + "\n", + "# Create the OpenAI client\n", + "from openai import OpenAI\n", + "\n", + "client = OpenAI(api_key=os.environ.get(\"OPENAI_API_KEY\", \"sk-dummykey\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "7ccd93f6", + "metadata": {}, + "outputs": [], + "source": [ + "def run_conversation_loop(\n", + " client,\n", + " messages,\n", + " tools,\n", + " tool_handlers,\n", + " response_format,\n", + " model,\n", + "):\n", + " \"\"\"Run the OpenAI response completion loop, handling function calls via tool_handlers until parsing final response.\"\"\"\n", + " summaries = []\n", + " while True:\n", + " print(\n", + " f\"Requesting completion from model '{model}' (messages={len(messages)})\"\n", + " )\n", + " response = client.responses.parse(\n", + " model=model,\n", + " input=messages,\n", + " tools=tools,\n", + " text_format=response_format,\n", + " reasoning={\"summary\": \"auto\"},\n", + " )\n", + " summaries.append(response.output[0].summary)\n", + "\n", + " if not response.output_parsed:\n", + " print(\"Assistant requested tool calls, resolving ...\")\n", + "\n", + " reasoning_msg, tool_call = response.output\n", + " messages.append(reasoning_msg)\n", + " messages.append({\n", + " \"id\": tool_call.id,\n", + " \"call_id\": tool_call.call_id,\n", + " \"type\": tool_call.type,\n", + " \"name\": tool_call.name,\n", + " \"arguments\": tool_call.arguments,\n", + " })\n", + "\n", + " if tool_call.name in tool_handlers:\n", + " try:\n", + " args = json.loads(tool_call.arguments)\n", + " 
except Exception as exc:\n", + " print(\n", + " \"Failed to parse %s arguments: %s\", tool_call.name, exc\n", + " )\n", + " args = {}\n", + " result = tool_handlers[tool_call.name](**args)\n", + " messages.append(\n", + " {\n", + " \"type\": \"function_call_output\",\n", + " \"call_id\": tool_call.call_id,\n", + " \"output\": str(result),\n", + " }\n", + " )\n", + " print(f\"Tool call {tool_call.name} complete, result: {str(result)}\")\n", + " else:\n", + " print(\"Unhandled function call: %s\", tool_call.name)\n", + "\n", + " if response.output_parsed is not None:\n", + " print(\"Received parsed result from model\")\n", + " return response, summaries" + ] + }, + { + "cell_type": "markdown", + "id": "76755e0d", + "metadata": {}, + "source": [ + "**Flow Explanation: Stage 1**\n", + "\n", + "1. **Image:** The image of the form taken from the user's smartphone is passed to the model. OpenAI's models can accept a variety of image formats, but we typically use a PNG format to keep the text crisp and reduce artifacts. For this example, we pass the image to the model from a publically available content URL. In a production environment, you likely would pass the image as a signed URL to an image hosted in your own cloud storage bucket. \n", + " \n", + "2. **Structured Output Schema:** We define a Pydantic model that sets the structure of the output data. The model includes all of the fields that we need to extract from the form, along with the appropriate types for each field. Our model is broken into several subcomponents, each of which is a Pydantic model itself and referenced by the parent model." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "59263ec9", + "metadata": {}, + "outputs": [], + "source": [ + "class PersonContact(BaseModel):\n", + " name: str\n", + " home_phone: str\n", + " work_phone: str\n", + " cell_phone: str\n", + " email: str\n", + "\n", + "class Address(BaseModel):\n", + " street: str\n", + " city: str\n", + " state: str\n", + " zip: str\n", + " county: str\n", + "\n", + "class DwellingDetails(BaseModel):\n", + " coverage_a_limit: str\n", + " compantion_policy_expiration_date: str\n", + " occupancy_of_dwelling: str\n", + " type_of_policy: str\n", + " unrepaired_structural_damage: bool\n", + " construction_type: str\n", + " roof_type: str\n", + " foundation_type: str\n", + " has_post_and_pier_or_post_and_beam_foundation: bool\n", + " cripple_walls: bool\n", + " number_of_stories: str\n", + " living_space_over_garage: bool\n", + " number_of_chimneys: str\n", + " square_footage: str\n", + " year_of_construction: str\n", + " anchored_to_foundation: bool\n", + " water_heater_secured: bool\n", + "\n", + "class InsuranceFormData(BaseModel):\n", + " applicant: PersonContact\n", + " co_applicant: PersonContact\n", + " risk_address: Address\n", + " mailing_address_if_different_than_risk_address: Address\n", + " participating_insurer: str\n", + " companion_policy_number: str\n", + " dwelling_details: DwellingDetails\n", + " effective_date: str\n", + " expiration_date: str" + ] + }, + { + "cell_type": "markdown", + "id": "70e746a3", + "metadata": {}, + "source": [ + "3. **Run OCR:** Using the vision capabilities of GPT-4.1, we run the first stage of our pipeline to extract the text from the document in a structured format. This initial stage aims to achieve high accuracy while passing through uncertainty to the second stage. Our prompt explicitly instructs the model to avoid inferring inputs and instead to fill out the details as exact as possible. 
For the image input, we set image input detail to `auto` to infer a detail level that's appropriate to the image. We found in our experiments that `auto` worked well, but if you are seeing quality issues in your OCR processing considering using `high`." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "1537dad2", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "HTTP Request: POST https://api.openai.com/v1/responses \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"applicant\": {\n", + " \"name\": \"Smith, James L\",\n", + " \"home_phone\": \"510 331 5555\",\n", + " \"work_phone\": \"\",\n", + " \"cell_phone\": \"510 212 5555\",\n", + " \"email\": \"jsmithl@gmail.com OR jsmith1@gmail.com\"\n", + " },\n", + " \"co_applicant\": {\n", + " \"name\": \"Roberts, Jesse T\",\n", + " \"home_phone\": \"510 331 5555\",\n", + " \"work_phone\": \"415 626 5555\",\n", + " \"cell_phone\": \"\",\n", + " \"email\": \"jrobertsjr@gmail.com\"\n", + " },\n", + " \"risk_address\": {\n", + " \"street\": \"855 Brannan St\",\n", + " \"city\": \"San Francisco\",\n", + " \"state\": \"CA\",\n", + " \"zip\": \"\",\n", + " \"county\": \"\"\n", + " },\n", + " \"mailing_address_if_different_than_risk_address\": {\n", + " \"street\": \"\",\n", + " \"city\": \"\",\n", + " \"state\": \"\",\n", + " \"zip\": \"\",\n", + " \"county\": \"\"\n", + " },\n", + " \"participating_insurer\": \"Acme Insurance Co\",\n", + " \"companion_policy_number\": \"81265919\",\n", + " \"dwelling_details\": {\n", + " \"coverage_a_limit\": \"$900,000\",\n", + " \"compantion_policy_expiration_date\": \"5/31/27\",\n", + " \"occupancy_of_dwelling\": \"Owner\",\n", + " \"type_of_policy\": \"Homeowners\",\n", + " \"unrepaired_structural_damage\": false,\n", + " \"construction_type\": \"Frame\",\n", + " \"roof_type\": \"Composition\",\n", + " \"foundation_type\": \"Raised\",\n", + " \"has_post_and_pier_or_post_and_beam_foundation\": false,\n", + " \"cripple_walls\": false,\n", + " \"number_of_stories\": \"Greater than 1 story\",\n", + " \"living_space_over_garage\": true,\n", + " \"number_of_chimneys\": \"2\",\n", + " \"square_footage\": \"1200\",\n", + " \"year_of_construction\": \"2005\",\n", + " \"anchored_to_foundation\": true,\n", + " \"water_heater_secured\": true\n", + " },\n", + " \"effective_date\": \"5/31/25\",\n", + " \"expiration_date\": \"5/31/27\"\n", + "}\n" + ] + } + ], + "source": [ + "OCR_PROMPT = \"\"\"You are a helpful assistant who excels at processing insurance forms.\n", + "\n", + "You will be given an image of a hand-filled insurance form. Your job is to OCR the data into the given structured format.\n", + "Fill out the fields as exactly as possible. If a written character could possibly be ambigious (i.e. 
l or 1, o or 0), include all possiblities in the field separated by \"OR\", especially for email addresses.\n", + "\"\"\"\n", + "\n", + "user_content = [\n", + " {\"type\": \"input_text\", \"text\": \"Here is a photo of the form filled out by the user:\"},\n", + " {\n", + " \"type\": \"input_image\",\n", + " \"image_url\": \"https://drive.usercontent.google.com/download?id=1-tZ526AW3mX1qthvgi8spaaxxeqFG5_6\",\n", + " \"detail\": \"auto\",\n", + " },\n", + "]\n", + "\n", + "messages = [\n", + " {\"role\": \"system\", \"content\": OCR_PROMPT},\n", + " {\"role\": \"user\", \"content\": user_content},\n", + "]\n", + "\n", + "response = client.responses.parse(\n", + " model=\"gpt-4.1-2025-04-14\",\n", + " input=messages,\n", + " text_format=InsuranceFormData,\n", + " # Set temp to 0 for reproducibility\n", + " temperature=0,\n", + ")\n", + "\n", + "s1_json_results = json.dumps(json.loads(response.output_parsed.model_dump_json()), indent=2)\n", + "print(s1_json_results)" + ] + }, + { + "cell_type": "markdown", + "id": "42296380", + "metadata": {}, + "source": [ + "Notice that the output is missing several fields. In the next stage of processing we will take advantage of OpenAI's reasoning models to infer the missing fields where possible.\n", + "\n", + "**Flow Explanation: Stage 2**\n", + "\n", + "1. **Function Definitions:** We define a set of custom functions that the model can use to resolve uncertainty. In this case, we define a function that can validate email addresses by checking if the email exists. This can be used to resolve the ambiguous email address field where the model must choose between multiple possible values. By default, o4-mini supports built-in tools like web search, which in this case it will use to resolve zip codes and incomplete addresses." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "72dc150e", + "metadata": {}, + "outputs": [], + "source": [ + "tools = [{\n", + " \"type\": \"function\",\n", + " \"name\": \"validate_email\",\n", + " \"description\": \"Check if an email address is valid and exists.\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"email\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The email address to validate.\"\n", + " }\n", + " },\n", + " \"required\": [\n", + " \"email\"\n", + " ],\n", + " \"additionalProperties\": False\n", + " }\n", + "},\n", + "{\n", + " \"type\": \"function\",\n", + " \"name\": \"search_web\",\n", + " \"description\": \"Perform a web search.\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"query\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The search query to run through the search engine.\"\n", + " }\n", + " },\n", + " \"required\": [\n", + " \"query\"\n", + " ],\n", + " \"additionalProperties\": False\n", + " }\n", + "}]" + ] + }, + { + "cell_type": "markdown", + "id": "f9a9b808", + "metadata": {}, + "source": [ + "2. **Prompt:** We provide a prompt to the model explaining that we have extracted text via OCR and requesting that the model perform reasoning and function calling to fill in the missing or ambiguous fields." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "ae8fcf6d", + "metadata": {}, + "outputs": [], + "source": [ + "PROMPT = \"\"\"You are a helpful assistant who excels at processing insurance forms.\n", + "\n", + "You will be given a javascript representation of an OCR'd document. 
Consider at which fields are ambigious reason about how to fill them in. Fill any missing fields that are possible to infer from existing data, or search the web. If you cannot fill a field, reason about why.\n", + "\n", + "Use the tools provided if necessary to clarify the results. If the OCR system has provided two possibilities, do your best to definitely pick which option is correct.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "1d2b77ee", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requesting completion from model 'o4-mini-2025-04-16' (messages=2)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "HTTP Request: POST https://api.openai.com/v1/responses \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Assistant requested tool calls, resolving ...\n", + "Tool call validate_email complete, result: True\n", + "Requesting completion from model 'o4-mini-2025-04-16' (messages=5)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "HTTP Request: POST https://api.openai.com/v1/responses \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Received parsed result from model\n", + "{\n", + " \"applicant\": {\n", + " \"name\": \"Smith, James L\",\n", + " \"home_phone\": \"510 331 5555\",\n", + " \"work_phone\": \"\",\n", + " \"cell_phone\": \"510 212 5555\",\n", + " \"email\": \"jsmithl@gmail.com\"\n", + " },\n", + " \"co_applicant\": {\n", + " \"name\": \"Roberts, Jesse T\",\n", + " \"home_phone\": \"510 331 5555\",\n", + " \"work_phone\": \"415 626 5555\",\n", + " \"cell_phone\": \"\",\n", + " \"email\": \"jrobertsjr@gmail.com\"\n", + " },\n", + " \"risk_address\": {\n", + " \"street\": \"855 Brannan St\",\n", + " \"city\": \"San Francisco\",\n", + " \"state\": \"CA\",\n", + " \"zip\": \"94107\",\n", + " \"county\": \"San Francisco\"\n", + " },\n", + " \"mailing_address_if_different_than_risk_address\": {\n", + " \"street\": \"\",\n", + " \"city\": \"\",\n", + " \"state\": \"\",\n", + " \"zip\": \"\",\n", + " \"county\": \"\"\n", + " },\n", + " \"participating_insurer\": \"Acme Insurance Co\",\n", + " \"companion_policy_number\": \"81265919\",\n", + " \"dwelling_details\": {\n", + " \"coverage_a_limit\": \"$900,000\",\n", + " \"compantion_policy_expiration_date\": \"5/31/27\",\n", + " \"occupancy_of_dwelling\": \"Owner\",\n", + " \"type_of_policy\": \"Homeowners\",\n", + " \"unrepaired_structural_damage\": false,\n", + " \"construction_type\": \"Frame\",\n", + " \"roof_type\": \"Composition\",\n", + " \"foundation_type\": \"Raised\",\n", + " \"has_post_and_pier_or_post_and_beam_foundation\": false,\n", + " \"cripple_walls\": false,\n", + " \"number_of_stories\": \"Greater than 1 story\",\n", + " \"living_space_over_garage\": true,\n", + " \"number_of_chimneys\": \"2\",\n", + " \"square_footage\": \"1200\",\n", + " \"year_of_construction\": \"2005\",\n", + " \"anchored_to_foundation\": true,\n", + " \"water_heater_secured\": true\n", + " },\n", + " \"effective_date\": \"5/31/25\",\n", + " \"expiration_date\": \"5/31/27\"\n", + "}\n" + ] + } + ], + "source": [ + "messages = [\n", + " {\"role\": \"system\", \"content\": PROMPT},\n", + " {\"role\": \"user\", \"content\": s1_json_results},\n", + "]\n", + "\n", + "# For demonstration purposes, we'll hardcode the correct email answer.\n", + "def email_mock(*args, **kwargs):\n", + " if kwargs[\"email\"] == 
\"jsmithl@gmail.com\":\n", + " return True\n", + " return False\n", + "\n", + "# Reasoning models like `o4-mini` will soon support built-in web search, but for now\n", + "# we demonstrate this capability using a simple mock function.\n", + "def web_mock(*args, **kwargs):\n", + " if \"855 Brannan\" in kwargs[\"query\"]:\n", + " return \"855 Brannan St, San Francisco, 94103, San Francisco County\"\n", + " \n", + " return \"\"\n", + " \n", + "tool_handlers = {\"validate_email\": email_mock, \"search_web\": web_mock}\n", + "\n", + "response, summaries = run_conversation_loop(\n", + " client=client,\n", + " messages=messages,\n", + " tools=tools,\n", + " tool_handlers=tool_handlers,\n", + " response_format=InsuranceFormData,\n", + " model=\"o4-mini-2025-04-16\",\n", + ")\n", + "\n", + "print(json.dumps(json.loads(response.output_parsed.model_dump_json()), indent=2))" + ] + }, + { + "cell_type": "markdown", + "id": "cb3f3115", + "metadata": {}, + "source": [ + "You can see that the email address has been refined to a single value, the zip code and county have been filled in, and the mailing address has been filled in by using the risk address. The model has also returned the results in a structured format (with appropriate types such as boolean for yes/no questions), which can be easily parsed by a downstream system.\n", + "\n", + "To help us understand and debug the model, we can also print the summary chain-of-thought reasoning produced by the model. This can help expose common failure modes, points where the model is unclear, or incorrect upstream details.\n", + "\n", + "While developing this solution the chain-of-thought summaries exposed some incorrectly named and typed schema values." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "ab1d4fbc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "**Verifying address details**\n", + "\n", + "I need to validate the email and fill out the JSON correctly. I noticed that the zip code and county are missing for the address 855 Brannan St, San Francisco, CA. Based on the location, the zip code should be 94107, and it's in San Francisco County. Since the mailing address is the same, I’ll leave that blank. Also, I've checked the expiration date, which is noted as 5/31/27, and the co-applicant's cell phone is blank, so I can't fill that in either.\n", + "\n", + "**Finalizing JSON details**\n", + "\n", + "I confirmed the co-applicant's home phone is the same as the applicant's, which is 510-331-5555. For the email, I'll validate either jsmithl@gmail.com or jsmith1@gmail.com, but I think jsmithl is more likely correct because \"l\" likely stands for the last initial of James L. I've decided the risk address zip is 94107 and the county is San Francisco. \n", + "\n", + "I'll prepare the final JSON format, leaving the mailing address blank since it is not different from the risk address.\n", + "\n" + ] + } + ], + "source": [ + "for summary in summaries:\n", + " for response in summary:\n", + " print(response.text + '\\n')" + ] + }, + { + "cell_type": "markdown", + "id": "f2bd52eb", + "metadata": {}, + "source": [ + "## 3\\. Model and Capabilities Playbook\n", + "\n", + "Selecting the right tool for the job is key to getting the best results. In general, it's a good idea to start with the simplest solution that fits your needs and then upgrade if you need more capabilities.\n", + "\n", + "| Task | Start With | Upgrade When... 
| Escalate To | Rationale |\n", + "| :---- | :---- | :---- | :---- | :---- |\n", + "| OCR | `gpt-4.1` | Complex forms that are difficult to understand at a glance | `o3` | `gpt-4.1` is fast and cost-effective for most OCR. `o-3` has the ability to reason about form structure. |\n", + "| Results Refinement | `o4-mini` | Complex logic for inferring details, many function calls required. | `o3` | Better for very long chains of reasoning, especially with both function calls and structured output. |\n", + "\n", + "## 4\\. Evaluation Metrics\n", + "\n", + "Track key metrics to ensure the system is performing accurately and as expected.\n", + "\n", + "### Critical Metrics\n", + "\n", + "* **OCR Accuracy:** Per-character and per-word accuracy. \n", + "* **Inferred Field Rate:** Portion unfilled entries correctly inferred from either existing data or function calling. \n", + "* **Human Intervention Rate:** How often a document contains an UNKNOWN and must be referred to a human.\n", + "\n", + "We recommend building a labeled hold-out set of forms and their expected responses. This dataset should be representative of the expected deployment environment, see the [OpenAI evals](https://platform.openai.com/docs/guides/evals) guide for more detailed information on building and evaluating your system.\n", + "\n", + "## 5\\. Deployment Notes\n", + "\n", + "Moving from prototype to a production-ready system requires attention to operational details (LLMOps).\n", + "\n", + "### Cost Breakdown\n", + "\n", + "We will assume that for document ingestion, [batch pricing](https://platform.openai.com/docs/guides/batch) is a viable option due to high latency tolerance (i.e. overnight runs are fine).\n", + "\n", + "#### **Stage 1: OCR (Optical Character Recognition)**\n", + "\n", + "**Model:** `gpt-4.1`\n", + "\n", + "| Type | Tokens | Rate (per 1M) | Cost |\n", + "| :---- | :---- | :---- | :---- |\n", + "| Input | 2,000 | $1.00 | $0.002 |\n", + "| Output | 1,500 | $4.00 | $0.006 |\n", + "| **Total (Stage 1\\)** | | | **$8.00** |\n", + "\n", + "#### **Stage 2: Reasoning**\n", + "\n", + "**Model:** `o4-mini`\n", + "\n", + "| Type | Tokens | Rate (per 1M) | Cost |\n", + "| :---- | :---- | :---- | :---- |\n", + "| Input | 2,000 | $0.55 | $0.0011 |\n", + "| Output | 3,000 | $2.20 | $0.0066 |\n", + "| **Total (Stage 2\\)** | | | **$7.70** |\n", + "\n", + "#### Grand Total (per 1,000 pages): **$15.70**\n", + "\n", + "Compare this cost to a one-stage `o3` deployment. Assuming equal token usage and batch usage, the additional cost of the more powerful reasoning model would come to $70/1000 pages.\n", + "\n", + "### Monitoring & Deployment\n", + "\n", + "Monitor your system by logging key metrics:\n", + "\n", + "* `llm_model_used`, `llm_input_tokens`, `llm_output_tokens`, `llm_latency_ms` per model \n", + "* `total_query_latency_ms`, `estimated_query_cost` per model \n", + "* `function_calls_per_document`, `num_email_validation_calls` \n", + "* `human_review_required`\n", + "\n", + "Pin the specific model version identifier (e.g., `o4-mini-2025-04-16`) used in deployment via configuration/environment variables to prevent unexpected behavior from silent model updates.\n", + "\n", + "## 6\\. 
Useful Cookbooks & Resources\n", + "\n", + "Refer to these related resources for deeper dives into specific components:\n", + "\n", + "* [Structured Output](https://platform.openai.com/docs/guides/structured-outputs) \n", + "* [Vision Models](https://platform.openai.com/docs/guides/images) \n", + "* [Function Calling](https://platform.openai.com/docs/guides/function-calling)\n", + "\n", + "================================================================================\n", + "\n", + "\n", + "

## Prototype to Production

\n", + "\n", + "Transitioning a prototype to production requires careful planning and execution. This checklist highlights critical steps, drawing from our flagship use cases, to ensure your deployment is robust, efficient, and meets business goals.\n", + "\n", + "## 🗂️ TL;DR Matrix\n", + "\n", + "| Checklist Area | Key Focus / Actions | Why it Matters |\n", + "| :---- | :---- | :---- |\n", + "| **Define Success Criteria** | • Define measurable KPIs & SLOs (accuracy, cost, latency). • Ensure targets are measurable via logs. | Provides clear targets; proves value. |\n", + "| **Document Model Rationale** | • Select initial models deliberately based on trade-offs. • Document the \"why\" behind model choices. | Justifies choices; aids future updates. |\n", + "| **Robust Evaluation & Testing** | • Build automated tests (\"eval suite\") using a golden set. • Focus on factuality, hallucinations, tool errors. • Test tool reliability & edge cases. | Ensures quality; prevents regressions before release. |\n", + "| **Observability & Cost** | • Implement essential logging for monitoring & debugging. • Set cost guardrails (token limits, usage modes). | Enables tuning; keeps spending within budget. |\n", + "| **Safety & Compliance** | • Use safety mechanisms (moderation APIs, prompts). • Enforce domain-specific compliance rules. • Mandate Human-in-the-Loop (HITL) for high-risk outputs. | Ensures responsible operation; meets requirements. |\n", + "| **Model Updates & Versioning** | • Define version pinning strategy • Implement A/B testing for new versions • Create rollback procedures | Maintains stability while allowing improvements. |\n", + "\n", + "1. **Define Success Criteria Quantitatively:** Move beyond \"it works\" to measurable targets *before* major development. \n", + " \n", + " * **Set Key Performance Indicators (KPIs) & SLOs:** Define specific targets for business value (e.g., RAG accuracy \\> 95%, OCR cost \\< $X/page) and performance (e.g., P95 latency \\< 1s, error rates). \n", + " * **Ensure Measurability:** Confirm that all KPIs and SLOs can be directly measured from system logs (e.g., tracking `total_tokens`, `critique_status`).\n", + "\n", + " \n", + "\n", + "2. **Document Initial Model Selection Rationale:** Justify your starting model choices for future reference. \n", + " \n", + " * **Choose Models Deliberately:** Use the Model-Intro Matrix and use cases to select appropriate models for each task (e.g., `o4-mini` for speed/cost, `gpt-4.1` for accuracy, `o3` for depth). \n", + " * **Record the \"Why\":** Briefly document the reasoning behind your choices (cost, latency, capability trade-offs) in code comments or design docs so future teams understand the context.\n", + "\n", + " \n", + "\n", + "3. **Implement Robust Evaluation & Testing:** Verify quality and prevent regressions *before* shipping changes. \n", + " \n", + " * **Build an Automated Eval Suite:** Create a repeatable test process using a \"golden set\" (50-100 diverse, expert-verified examples). Focus tests on `factuality`, `hallucination rate`, `tool-error rate`, and task-specific metrics. \n", + " * **Test Reliably:** Rigorously test integrated tool reliability (success rate, error handling) and system behavior under load and with edge cases (malformed data, adversarial inputs).\n", + "\n", + " \n", + "\n", + "4. **Establish Observability & Cost Controls:** Monitor performance and keep spending within budget. 
\n", + " \n", + " * **Set Cost Guardrails:** Prevent unexpected cost increases by defining max token limits per stage and considering operational modes (\"Fast,\" \"Standard,\" \"Thorough\") to balance cost and performance. \n", + " * **Implement Essential Logging:** Capture key operational data via structured logs for each processing stage to enable debugging and monitoring.\n", + "\n", + " \n", + "\n", + "5. **Implement Safety & Compliance Guardrails:** Ensure responsible operation and meet requirements. \n", + " \n", + " * **Use Safety Mechanisms:** Employ tools like OpenAI's moderation APIs, safety-focused system prompts, or sentinel models for checks, especially with user input or sensitive topics. \n", + " * **Enforce Compliance:** Build in checks relevant to your specific industry and risks (e.g., legal constraints, lab safety). \n", + " * **Require Human-in-the-Loop (HITL):** Mandate human review for low-confidence outputs, high-risk scenarios, or critical decisions, ensuring the workflow flags these items clearly.\n", + "\n", + "\n", + "6. **Manage Model Updates and Versioning:** Prepare for model evolution over time.\n", + " \n", + " * **Version Pinning Strategy:** Decide whether to pin to specific model versions for stability or automatically adopt new versions for improvements.\n", + " * **A/B Testing Framework:** Establish a process to evaluate new model versions against your key metrics before full deployment.\n", + " * **Rollback Plan:** Create a clear procedure for reverting to previous model versions if issues arise with updates.\n", + " * **Monitor Version Performance:** Track metrics across model versions to identify performance trends and inform future selection decisions.\n", + "\n", + "================================================================================\n", + "\n", + "\n", + "\n", + "## Adaptation Decision Tree\n", + "\n", + "![Model Selection Decision Tree](images/3D_model_selection_flowchart.png)\n", + "\n", + "## Communicating Model Selection to Non-Technical Stakeholders\n", + "\n", + "When explaining your model choices to business stakeholders, focus on these key points:\n", + "\n", + "1. **Align with Business Outcomes**: Explain how your model selection directly supports specific business goals (time savings, cost reduction, improved accuracy).\n", + "\n", + "2. **Translate Technical Metrics**: Convert technical considerations into business impact:\n", + " - \"This model reduces processing time from 5 seconds to 0.7 seconds, allowing us to handle customer inquiries 7x faster\"\n", + " - \"By using the mini variant, we can process 5x more documents within the same budget\"\n", + "\n", + "3. **Highlight Trade-offs**: Present clear scenarios for different models:\n", + " - \"Option A (GPT-4.1): Highest accuracy but higher cost - ideal for client-facing legal analysis\"\n", + " - \"Option B (GPT-4.1 mini): 90% of the accuracy at 30% of the cost - perfect for internal document processing\"\n", + "\n", + "4. 
**Use Concrete Examples**: Demonstrate the practical difference in outputs between models to illustrate the value proposition of each option.\n", + "\n", + "================================================================================\n", + "\n", + "\n", + "\n", + "## Appendices\n", + "\n", + "## Glossary of Key Terms\n", + "\n", + "| Term | Definition |\n", + "|------|------------|\n", + "| **Context Window** | The maximum number of tokens a model can process in a single request |\n", + "| **Hallucination** | When a model generates content that appears plausible but is factually incorrect or unsupported |\n", + "| **Latency** | The time delay between sending a request to a model and receiving a response |\n", + "| **LLM** | Large Language Model; an AI system trained on vast amounts of text data |\n", + "| **Prompt Engineering** | The practice of designing effective prompts to elicit desired outputs from AI models |\n", + "| **RAG** | Retrieval-Augmented Generation; combining information retrieval with text generation |\n", + "| **SOTA** | State-of-the-Art; representing the most advanced stage in a field at a given time |\n", + "| **Token** | The basic unit of text that models process (roughly 0.75 words in English) |\n", + "\n", + "## 6.1 Price and Utility Table (Apr 2025)\n", + "\n", + "| Model | Context Window | Input Price (per 1M tokens) | Output Price (per 1M tokens) | Best For |\n", + "|-------|----------------|-----------------------------|-----------------------------|----------|\n", + "| GPT-4.1 | 1M | \\$2.00 | \\$8.00 | Long-doc analytics, code review |\n", + "| GPT-4.1 mini | 1M | \\$0.40 | \\$1.60 | Production agents, balanced cost/performance |\n", + "| GPT-4.1 nano | 1M | \\$0.10 | \\$0.40 | High-throughput, cost-sensitive applications |\n", + "| GPT-4o | 128K | \\$5.00 | \\$15.00 | Real-time voice/vision chat |\n", + "| GPT-4o mini | 128K | \\$0.15 | \\$0.60 | Vision tasks, rapid analytics |\n", + "| o3 (low) | 200K | \\$10.00* | \\$40.00* | Bulk triage, catalog enrichment |\n", + "| o3 (med) | 200K | \\$10.00* | \\$40.00* | Knowledge base Q&A |\n", + "| o3 (high) | 200K | \\$10.00* | \\$40.00* | Multi-step reasoning, troubleshooting |\n", + "| o4-mini (low) | 200K | \\$1.10* | \\$4.40* | Vision tasks, rapid analytics |\n", + "| o4-mini (med) | 200K | \\$1.10* | \\$4.40* | Balanced vision + reasoning |\n", + "| o4-mini (high) | 200K | \\$1.10* | \\$4.40* | Deep reasoning with cost control |\n", + "\n", + "\\* *Note: The low/med/high settings affect token usage rather than base pricing. 
Higher settings may use more tokens for deeper reasoning, increasing per-request cost and latency.*\n", + "\n", + "## 6.2 Prompt-pattern Quick Sheet (Token vs Latency Deltas)\n", + "\n", + "| Prompt Pattern | Description | Token Impact | Latency Impact | Best Model Fit |\n", + "|----------------|-------------|--------------|----------------|----------------|\n", + "| **Self-Critique** | Ask model to evaluate its own answer before finalizing | +20-30% tokens | +15-25% latency | GPT-4.1, o3 |\n", + "| **Chain-of-Thought (CoT)** | Explicitly instruct to \"think step by step\" | +40-80% tokens | +30-50% latency | o3, o4-mini (high) |\n", + "| **Structured Outputs** | Use JSON schema or pydantic models for consistent formatting | +5-10% tokens | +5-10% latency | All models |\n", + "| **Zero-Token Memory** | Store context in external DB rather than in conversation | -70-90% tokens | -5-10% latency | GPT-4.1 family |\n", + "| **Skeleton-Fill-In** | Provide template structure for model to complete | -10-20% tokens | -5-15% latency | o4-mini, GPT-4.1 nano |\n", + "| **Self-Consistency** | Generate multiple answers and select most consistent | +200-300% tokens | +150-250% latency | o3 (high) |\n", + "| **Role-Playing** | Assign specific personas to model for specialized knowledge | +5-15% tokens | Neutral | GPT-4o, o4-mini |\n", + "| **Tournament Ranking** | Compare options pairwise rather than scoring individually | +50-100% tokens | +30-60% latency | o3, o4-mini (high) |\n", + "| **Tool-Calling Reflex** | Prompt model to call tools when uncertainty is detected | +10-30% tokens | +20-40% latency | o3, GPT-4.1 |\n", + "\n", + "## 6.3 Links to External Cookbooks & Docs\n", + "\n", + "### OpenAI Official Resources\n", + "- [OpenAI Cookbook Main Repository](https://cookbook.openai.com/)\n", + "- [Function Calling Guide](https://platform.openai.com/docs/guides/function-calling)\n", + "- [Vision Models Guide](https://platform.openai.com/docs/guides/vision)\n", + "- [Agents Documentation](https://platform.openai.com/docs/guides/agents)\n", + "- [Structured Outputs Guide](https://platform.openai.com/docs/guides/structured-outputs)\n", + "\n", + "### RAG & Retrieval\n", + "- [RAG on PDFs](https://cookbook.openai.com/examples/file_search_responses)\n", + "\n", + "### Specialized Use Cases\n", + "- [Voice Assistant with Agents SDK](https://cookbook.openai.com/examples/agents_sdk/app_assistant_voice_agents)\n", + "- [Multi-Tool Orchestration](https://cookbook.openai.com/examples/responses_api/responses_api_tool_orchestration)\n", + "- [Data Extraction and Transformation](https://cookbook.openai.com/examples/data_extraction_transformation)\n", + "\n", + "### Prompting & Model Selection\n", + "- [GPT-4.1 Prompting Guide](https://cookbook.openai.com/examples/gpt4-1_prompting_guide)\n", + "- [Prompt Engineering Best Practices](https://platform.openai.com/docs/guides/prompt-engineering)\n", + "\n", + "### Evaluation & Deployment\n", + "- [Getting Started with OpenAI Evals](https://cookbook.openai.com/examples/evaluation/getting_started_with_openai_evals)\n", + "- [How to use the Usage API and Cost API to monitor your OpenAI usage](https://cookbook.openai.com/examples/completions_usage_api)\n", + "\n", + "================================================================================\n", + "\n", + "\n", + "\n", + "## Contributors\n", + "\n", + "- Kashyap Coimbatore Murali\n", + "- Nate Harada \n", + "- Sai Prashanth Soundararaj \n", + "- Shikhar Kwatra " + ] + } + ], + "metadata": { + "kernelspec": { + 
"display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/partners/model_selection_guide/tools.py b/examples/partners/model_selection_guide/tools.py new file mode 100644 index 0000000000..d24544f6b3 --- /dev/null +++ b/examples/partners/model_selection_guide/tools.py @@ -0,0 +1,312 @@ +""" +Mock implementations of tool functions for AI Co-Scientist. + +These are simple mocks of the external tools that would be used in a real implementation. +""" + +import random, logging +from typing import Dict, List, Any, Optional + +# Mock database of chemical properties +MOCK_CHEMICALS = { + "Palladium acetate": { + "solubility": "Soluble in chloroform, slightly soluble in acetone", + "melting_point": "205°C (decomposition)", + "hazards": "Irritant, potential carcinogen", + "approved_status": True, + "cost_per_gram": 85.50 + }, + "Triphenylphosphine": { + "solubility": "Soluble in ethanol, benzene, chloroform", + "melting_point": "80-82°C", + "hazards": "Irritant", + "approved_status": True, + "cost_per_gram": 12.75 + }, + "Triethylamine": { + "solubility": "Miscible with water, ethanol", + "melting_point": "-115°C", + "boiling_point": "89°C", + "hazards": "Flammable, corrosive", + "approved_status": True, + "cost_per_gram": 5.25 + }, + "Sodium borohydride": { + "solubility": "Soluble in water, methanol", + "melting_point": "400°C (decomposition)", + "hazards": "Flammable, water-reactive", + "approved_status": True, + "cost_per_gram": 8.90 + }, + "Dimethylformamide": { + "solubility": "Miscible with water, ethanol", + "boiling_point": "153°C", + "hazards": "Reproductive toxin, flammable", + "approved_status": True, + "cost_per_gram": 3.15 + }, + "Palladium chloride": { + "solubility": "Slightly soluble in water, soluble in HCl", + "melting_point": "679°C", + "hazards": "Irritant, potential carcinogen", + "approved_status": True, + "cost_per_gram": 75.20 + }, + "Potassium carbonate": { + "solubility": "Soluble in water", + "melting_point": "891°C", + "hazards": "Irritant", + "approved_status": True, + "cost_per_gram": 2.50 + }, + "Toluene": { + "solubility": "Immiscible with water, miscible with organic solvents", + "boiling_point": "110.6°C", + "hazards": "Flammable, CNS depressant", + "approved_status": True, + "cost_per_gram": 1.75 + }, + "Methanol": { + "solubility": "Miscible with water", + "boiling_point": "64.7°C", + "hazards": "Flammable, toxic", + "approved_status": True, + "cost_per_gram": 1.20 + }, + "XYZ-13": { + "solubility": "Slightly soluble in organic solvents", + "melting_point": "185-188°C", + "hazards": "Mild irritant", + "approved_status": True, + "cost_per_gram": 250.00 + } +} + +# Mock database of past experiment outcomes +MOCK_OUTCOMES = { + "XYZ-13": [ + { + "id": "exp-001", + "catalyst": "Palladium acetate", + "temperature": 85, + "solvent": "Dimethylformamide", + "yield": 62.3, + "duration": 36, + "notes": "Yield decreased at temperatures above 85°C." + }, + { + "id": "exp-002", + "catalyst": "Palladium chloride", + "temperature": 70, + "solvent": "Toluene", + "yield": 58.7, + "duration": 42, + "notes": "Lower temperature gave slightly lower yield but higher purity." 
+ }, + { + "id": "exp-003", + "catalyst": "Palladium acetate", + "temperature": 90, + "solvent": "Methanol", + "yield": 45.2, + "duration": 28, + "notes": "Significant side products observed at this temperature." + } + ] +} + +# Mock literature database +MOCK_LITERATURE = [ + { + "title": "Palladium-Catalyzed Cross-Coupling for the Synthesis of XYZ Derivatives", + "authors": "Smith, J.L., et al.", + "journal": "Journal of Organic Chemistry", + "year": 2024, + "abstract": "Novel methods using palladium catalysts at moderate temperatures showed improved yields for XYZ-type compounds." + }, + { + "title": "Solvent Effects on the Yield of XYZ Compounds", + "authors": "Johnson, M.R., et al.", + "journal": "Chemical Communications", + "year": 2023, + "abstract": "Polar aprotic solvents demonstrated superior performance in the synthesis of XYZ compounds, with yields up to 70%." + }, + { + "title": "Temperature-Controlled Synthesis of Pharmaceutical Intermediates", + "authors": "Rodriguez, A., et al.", + "journal": "ACS Catalysis", + "year": 2024, + "abstract": "Lower temperature protocols (50-65°C) with extended reaction times showed reduced side products for sensitive compounds." + } +] + +def list_available_chemicals() -> Dict: + """Mock function to list all available chemicals in the database.""" + logging.info(f"(Tool) List available chemicals") + return { + "status": "success", + "available_chemicals": list(MOCK_CHEMICALS.keys()) + } + +def chem_lookup(chemical_name: str, property: Optional[str] = None) -> Dict: + """Mock function to look up chemical properties.""" + logging.info(f"(Tool) Chemical lookup: {chemical_name}, {property}") + # Check if chemical exists in our mock database + if chemical_name not in MOCK_CHEMICALS: + similar_chemicals = [c for c in MOCK_CHEMICALS.keys() if any(word in c.lower() for word in chemical_name.lower().split())] + return { + "status": "not_found", + "message": f"Chemical '{chemical_name}' not found in database.", + "similar_chemicals": similar_chemicals if similar_chemicals else [] + } + + # Return specific property if requested + if property and property in MOCK_CHEMICALS[chemical_name]: + return { + "status": "success", + "chemical": chemical_name, + "property": property, + "value": MOCK_CHEMICALS[chemical_name][property] + } + + # Return all properties + return { + "status": "success", + "chemical": chemical_name, + "properties": MOCK_CHEMICALS[chemical_name] + } + +def cost_estimator(reagents: List[Dict] = [], equipment: Optional[List[str]] = None, duration_hours: Optional[float] = None) -> Dict: + """Mock function to estimate the cost of reagents and procedures.""" + logging.info(f"(Tool) Cost estimator: {reagents}, {equipment}, {duration_hours}") + total_cost = 0 + reagent_costs = {} + equipment_costs = {} + labor_cost = 0 + + # Calculate reagent costs + for reagent in reagents: + # Mock: Use defaults for missing keys instead of returning errors + if not isinstance(reagent, dict): + reagent = {"name": "Unknown reagent", "amount": 1, "unit": "g"} + + name = reagent.get("name", "XYZ-13") + amount = reagent.get("amount", 1) # Default to 1 if amount is missing + unit = reagent.get("unit", "g") # Default to grams if unit not specified + + # Convert units to grams for calculation + amount_in_grams = amount + if unit.lower() == "mg": + amount_in_grams = amount / 1000 + elif unit.lower() == "kg": + amount_in_grams = amount * 1000 + + # Look up cost per gram + cost_per_gram = MOCK_CHEMICALS.get(name, {}).get("cost_per_gram", 10.0) # Default cost if not found 
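+        # Reagent cost = amount normalized to grams (mg/kg handled above) × per-gram price;
+        # each cost is recorded per reagent and added into the running total below.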
+ cost = amount_in_grams * cost_per_gram + reagent_costs[name] = cost + total_cost += cost + + # Add equipment costs if provided + if equipment: + for item in equipment: + # Mock equipment costs + if "hplc" in item.lower(): + cost = 250.0 + elif "nmr" in item.lower(): + cost = 350.0 + elif "reactor" in item.lower(): + cost = 150.0 + else: + cost = 50.0 + + equipment_costs[item] = cost + total_cost += cost + + # Add labor costs based on duration + if duration_hours: + labor_rate = 75.0 # Mock hourly rate + labor_cost = duration_hours * labor_rate + total_cost += labor_cost + + return { + "status": "success", + "total_cost": round(total_cost, 2), + "reagent_costs": reagent_costs, + "equipment_costs": equipment_costs, + "labor_cost": labor_cost, + "currency": "USD" + } + +def outcome_db(compound: str, parameter: Optional[str] = None, limit: int = 5) -> Dict: + """Mock function to query the database of past experiment outcomes.""" + logging.info(f"(Tool) Outcome DB: {compound}, {parameter}, {limit}") + if compound not in MOCK_OUTCOMES: + return { + "status": "not_found", + "message": f"No experiments found for compound '{compound}'." + } + + experiments = MOCK_OUTCOMES[compound] + + # Filter by parameter if provided + if parameter: + filtered_experiments = [exp for exp in experiments if parameter in exp] + if not filtered_experiments: + return { + "status": "parameter_not_found", + "message": f"No experiments with parameter '{parameter}' found for compound '{compound}'." + } + experiments = filtered_experiments + + # Limit the number of results + experiments = experiments[:limit] + + return { + "status": "success", + "compound": compound, + "experiments": experiments, + "count": len(experiments) + } + +def literature_search(query: str, filter: Optional[str] = None, limit: int = 3) -> Dict: + """Mock function to search scientific literature for relevant information.""" + logging.info(f"(Tool) Literature search: {query}, {filter}, {limit}") + # Simple keyword matching for demo purposes + keywords = [word.lower() for word in query.split()] + + matched_literature = [] + for paper in MOCK_LITERATURE: + # Check if any keyword appears in title or abstract + title_lower = paper["title"].lower() + abstract_lower = paper["abstract"].lower() + + if any(keyword in title_lower or keyword in abstract_lower for keyword in keywords): + matched_literature.append(paper) + + # Apply filter if provided + if filter: + filter_year_match = None + # Try to extract year from filter + import re + year_match = re.search(r'20\d\d', filter) + if year_match: + filter_year = int(year_match.group()) + matched_literature = [paper for paper in matched_literature if paper["year"] == filter_year] + + # Filter by journal if mentioned + filter_lower = filter.lower() + journal_matches = [paper for paper in matched_literature if filter_lower in paper["journal"].lower()] + if journal_matches: + matched_literature = journal_matches + + # Limit the number of results + matched_literature = matched_literature[:limit] + + return { + "status": "success", + "query": query, + "filter": filter, + "results": matched_literature, + "count": len(matched_literature) + } \ No newline at end of file diff --git a/registry.yaml b/registry.yaml index 56f910319a..42f8214e2c 100644 --- a/registry.yaml +++ b/registry.yaml @@ -4,6 +4,24 @@ # should build pages for, and indicates metadata such as tags, creation date and # authors for each page. 
+- title: Practical Guide for Model Selection for Real‑World Use Cases + path: examples/partners/model_selection_guide/model_selection_guide.ipynb + date: 2025-05-05 + authors: + - shikhar-cyber + - kashyapm-tribe + - saip-tribe + - nharada-tribe + tags: + - responses + - functions + - web-search + - tool calling + - RAG + - insurance + - legal + - pharma + - title: EvalsAPI Use-case - Detecting prompt regressions path: examples/evaluation/use-cases/regression.ipynb date: 2025-04-08