From 26b16ae9f50c93756de7135fa08a43309dd4eeb5 Mon Sep 17 00:00:00 2001 From: Robert Tinn Date: Mon, 12 May 2025 08:55:57 +0100 Subject: [PATCH 1/5] Draft image understanding cookbook --- .gitignore | 1 + .../image_understanding_with_rag.ipynb | 819 ++++++++++++++++++ 2 files changed, 820 insertions(+) create mode 100644 examples/multimodal/image_understanding_with_rag.ipynb diff --git a/.gitignore b/.gitignore index 28e6d66de2..73fd322c61 100644 --- a/.gitignore +++ b/.gitignore @@ -137,6 +137,7 @@ dmypy.json *.DS_Store tmp_* examples/fine-tuned_qa/local_cache/* +examples/multimodal/.local_cache/* # PyCharm files .idea/ diff --git a/examples/multimodal/image_understanding_with_rag.ipynb b/examples/multimodal/image_understanding_with_rag.ipynb new file mode 100644 index 0000000000..242cee2ae4 --- /dev/null +++ b/examples/multimodal/image_understanding_with_rag.ipynb @@ -0,0 +1,819 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Image Understanding with RAG using OpenAI's Vision & Responses APIs\n", + "\n", + "Welcome! This cookbook guides you through working with multimodal data, using OpenAI's Vision & Responses APIs, with image understanding and file search capabilities. It demonstrates how to build a RAG system, powered by GPT 4.1, that can analyse customer experiences, from their feedback which can be both visual and text-based.\n", + "\n", + "Many datasets are multimodal, often containing both text and image data. A good example of this is in radiology in healthcare, where patient records contain both image scan and written report. In addition, many real-world datasets are noisy and contain missing or incomplete data meaning valuable information can be missed with analysing multiple modalities.\n", + "\n", + "This guide covers a common use case in customer service, which is analysing the experience of customers. This guide will cover synthetic generation for text and image modalities, combining image analysis with file search for more robust, context-aware answers from a RAG system and it also leverages the Evals API to evaluate the performance gain of including image understanding in the RAG system.\n", + "\n", + "---\n", + "\n", + "## Overview\n", + "\n", + "---\n", + "\n", + "## Table of Contents\n", + "\n", + "1. [Setup & Dependencies](#setup-and-dependencies)\n", + "2. [Example Generations](#example-generations)\n", + "3. [Data Processing](#data-processing)\n", + " - Load synthetic datasets\n", + " - Merge data\n", + "4. [Populating Vector Store](#populating-vector-store)\n", + " - Upload data for file search\n", + " - Set up attribute filters\n", + "5. [Retrieval and Filtering](#retrieval-and-filtering)\n", + " - Test retrieval performance\n", + " - Apply attribute-based filters\n", + "6. 
[Evaluation and Analysis](#evaluation-and-analysis)\n", + " - Compare predictions to ground truth\n", + " - Analyze performance metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup and Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install openai evals pandas matplotlib tqdm ipython --upgrade --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import base64\n", + "from io import BytesIO\n", + "from pathlib import Path\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "from openai import OpenAI\n", + "from IPython.display import display, Image\n", + "from tqdm.notebook import tqdm\n", + "\n", + "cache_dir = Path('.local_cache')\n", + "cache_dir.mkdir(parents=True, exist_ok=True)\n", + "\n", + "client = OpenAI()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example Generations\n", + "\n", + "Given how expensive it can be generate high-quality training and evaluation data for machine learning tasks, utilising synthetic data can be an effective alternative. The OpenAI Image API can be used to generate synthetic images for this purpose and this cookbook uses the Responses API to generate synthetic text data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "prompt = (\"Gourmet pasta neatly plated with garnish and sides on a white ceramic plate, \"\n", + " \"photographed from above on a restaurant table. Soft shadows and vibrant colors.\")\n", + "cache_path = f\".local_cache/{hash(prompt)}.png\"\n", + "\n", + "if not os.path.exists(cache_path):\n", + " response = client.images.generate(\n", + " model=\"gpt-image-1\",\n", + " prompt=prompt,\n", + " size=\"1024x1024\"\n", + " )\n", + " \n", + " with open(cache_path, \"wb\") as f:\n", + " f.write(base64.b64decode(response.data[0].b64_json))\n", + " print(f\"Generated and cached: {cache_path}\")\n", + "\n", + "else:\n", + " print(f\"Loading from cache: {cache_path}\")\n", + "\n", + "display(Image(filename=cache_path))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_food_delivery_review(sentiment: str = 'positive') -> str:\n", + " \"\"\"\n", + " Generate a synthetic food delivery review with the specified sentiment.\n", + " \n", + " Args:\n", + " sentiment: An adjective such as 'positive' or 'negative'.\n", + " \n", + " Returns:\n", + " Generated review text\n", + " \"\"\"\n", + " prompt = \"Write a very concise, realistic customer review for a recent food delivery.\"\n", + " prompt += f\" The review should reflect a {sentiment} experience.\"\n", + " \n", + " response = client.responses.create(\n", + " model=\"gpt-4.1\",\n", + " input=[{\"role\": \"user\", \"content\": prompt}]\n", + " )\n", + " return response.output_text\n", + "\n", + "\n", + "review = generate_food_delivery_review()\n", + "print(review)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Processing\n", + "\n", + "In this instance, we will use a pre-generated synthetic dataset of customer feedback, which includes both short text snippets and images from customer reviews, sometimes combined. You can also generate your own synthetic dataset for this cookbook using the above examples." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Download the dataset\n", + "! mkdir -p .local_cache/images\n", + "! wget https://raw.githubusercontent.com/robtinn/image_understanding_rag_dataset/main/data/df.csv -O .local_cache/df.csv\n", + "\n", + "\n", + "! wget https://raw.githubusercontent.com/robtinn/image_understanding_rag_dataset/main/data/images/1.png -O .local_cache/images/1.png\n", + "! wget https://raw.githubusercontent.com/robtinn/image_understanding_rag_dataset/main/data/images/2.png -O .local_cache/images/2.png\n", + "! wget https://raw.githubusercontent.com/robtinn/image_understanding_rag_dataset/main/data/images/3.png -O .local_cache/images/3.png\n", + "! wget https://raw.githubusercontent.com/robtinn/image_understanding_rag_dataset/main/data/images/4.png -O .local_cache/images/4.png\n", + "! wget https://raw.githubusercontent.com/robtinn/image_understanding_rag_dataset/main/data/images/5.png -O .local_cache/images/5.png\n", + "! wget https://raw.githubusercontent.com/robtinn/image_understanding_rag_dataset/main/data/images/6.png -O .local_cache/images/6.png\n", + "! wget https://raw.githubusercontent.com/robtinn/image_understanding_rag_dataset/main/data/images/7.png -O .local_cache/images/7.png" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def encode_image(image_path: str) -> str:\n", + " \"\"\"Encode image file to base64 string.\"\"\"\n", + " with open(image_path, \"rb\") as f:\n", + " return base64.b64encode(f.read()).decode(\"utf-8\")\n", + "\n", + "\n", + "def analyze_image_sentiment(image_path: str) -> str:\n", + " \"\"\"Analyze food delivery image and return sentiment analysis.\"\"\"\n", + " base64_image = encode_image(image_path)\n", + " response = client.responses.create(\n", + " model=\"gpt-4.1\",\n", + " input=[{\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"input_text\",\n", + " \"text\": \"Analyze this food delivery image. 
Respond with a brief description and sentiment (positive/negative) in one line.\"\n", + " },\n", + " {\n", + " \"type\": \"input_image\",\n", + " \"image_url\": f\"data:image/jpeg;base64,{base64_image}\",\n", + " },\n", + " ],\n", + " }],\n", + " max_output_tokens=50,\n", + " temperature=0.2\n", + " )\n", + " return response.output_text.strip()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\".local_cache/df.csv\")\n", + "cache_dir = Path(\".local_cache\")\n", + "\n", + "for idx, row in df[~df['image_path'].isna()].iterrows():\n", + " image_path = cache_dir / 'images' / row['image_path']\n", + " sentiment = analyze_image_sentiment(str(image_path))\n", + " df.at[idx, 'full_sentiment'] = f\"{row['text']} {sentiment}\" if pd.notna(row['text']) else sentiment\n", + " print(f\"Processed {row['image_path']}\")\n", + "\n", + "df['full_sentiment'] = df['full_sentiment'].fillna(df['text'])\n", + "\n", + "output_path = cache_dir / \"df_full_sentiment.csv\"\n", + "df.to_csv(output_path, index=False)\n", + "print(f\"\\nSaved results to {output_path}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_colwidth', 100) # Increase from default (50) to view full sentiment\n", + "display(df.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Populating Vector Store" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This example uses OpenAI's built-in vector store and file search capabilities to build a RAG system that can analyse customer experiences, from their feedback which can be both visual and text-based." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "text_vector_store = client.vector_stores.create(\n", + " name=\"food_delivery_reviews_text\",\n", + " metadata={\n", + " \"purpose\": \"text_understanding\",\n", + " \"created_by\": \"notebook\",\n", + " \"version\": \"1.0\"\n", + " }\n", + ")\n", + "text_vector_store_id = text_vector_store.id\n", + "\n", + "text_image_vector_store = client.vector_stores.create(\n", + " name=\"food_delivery_reviews_text_image\",\n", + " metadata={\n", + " \"purpose\": \"text_image_understanding\",\n", + " \"created_by\": \"notebook\",\n", + " \"version\": \"1.0\"\n", + " }\n", + ")\n", + "text_image_vector_store_id = text_image_vector_store.id\n", + "\n", + "print(\"Vector Store IDs:\")\n", + "print(f\" Text: {text_vector_store_id}\")\n", + "print(f\" Text+Image: {text_image_vector_store_id}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# upload files to vector database and set metadata\n", + "\n", + "def upload_files_to_vector_store(vector_store_id, df):\n", + " file_ids = []\n", + " for i, row in tqdm(df.iterrows(), total=len(df), desc=\"Uploading context files\"):\n", + " file_stream = BytesIO(row[\"full_sentiment\"].encode('utf-8'))\n", + " file_stream.name = f\"context_{row.get('id', i)}_{row.get('month', '')}.txt\"\n", + " \n", + " file = client.vector_stores.files.upload(\n", + " vector_store_id=vector_store_id,\n", + " file=file_stream\n", + " )\n", + " file_ids.append(file.id)\n", + "\n", + " for i, row in tqdm(df.iterrows(), total=len(df), desc=\"Updating file attributes\"):\n", + " client.vector_stores.files.update(\n", + " vector_store_id=vector_store_id,\n", + " 
file_id=file_ids[i],\n", + " attributes={\"month\": row[\"month\"]}\n", + " )\n", + " import time\n", + " time.sleep(1) # TODO" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "upload_files_to_vector_store(text_image_vector_store_id, df)\n", + "upload_files_to_vector_store(text_image_vector_store_id, df) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Retrieval and Filtering\n", + "\n", + "We can analyse our dataset with natural language queries with the help of File Search. For the text-only dataset, we see that information is missing that could inform our analysis.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Query the vector store for spaghetti reviews in July\n", + "query = \"What were the reviews like for the spaghetti?\"\n", + "print(f\"🔍 Query: {query}\\n\")\n", + "\n", + "# Execute the search with filtering\n", + "response = client.responses.create(\n", + " model=\"gpt-4.1\",\n", + " input=query,\n", + " tools=[{\n", + " \"type\": \"file_search\",\n", + " \"vector_store_ids\": [text_vector_store_id],\n", + " \"filters\": {\n", + " \"type\": \"eq\",\n", + " \"key\": \"month\",\n", + " \"value\": \"july\"\n", + " }\n", + " }]\n", + ")\n", + "\n", + "# Display the results\n", + "print(\"📝 Response:\")\n", + "print(\"-\" * 40)\n", + "print(response.output_text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"What were the reviews like for the spaghetti?\"\n", + "print(f\"🔍 Query: {query}\\n\")\n", + "\n", + "response = client.responses.create(\n", + " model=\"gpt-4.1\",\n", + " input=query,\n", + " tools=[{\n", + " \"type\": \"file_search\",\n", + " \"vector_store_ids\": [text_image_vector_store_id],\n", + " \"filters\": {\n", + " \"type\": \"eq\",\n", + " \"key\": \"month\",\n", + " \"value\": \"july\"\n", + " }\n", + " }]\n", + ")\n", + "\n", + "print(\"📝 Response:\")\n", + "print(\"-\" * 40)\n", + "print(response.output_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can confirm if this is correct by checking the retrieved images." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "IMAGE_ID_MAPPING = {\n", + " f\"context_{row['id']}_{row['month']}.txt\": row[\"image_path\"]\n", + " for _, row in df[~df['image_path'].isna()].iterrows()\n", + "}\n", + "\n", + "def display_retrieved_images(\n", + " response: Any,\n", + " cache_dir: str = \".local_cache\"\n", + ") -> Dict[str, str]:\n", + " \"\"\"\n", + " Display images from the retrieved search results.\n", + " \n", + " Args:\n", + " response: The response object from the search query\n", + " cache_dir: Directory where images are stored\n", + " \n", + " Returns:\n", + " Dict mapping filenames to image paths for the displayed images\n", + " \"\"\"\n", + " # Get the annotations from the response\n", + " try:\n", + " annotations = response.output[1].content[0].annotations\n", + " retrieved_files = {result.filename for result in annotations}\n", + " except (AttributeError, IndexError):\n", + " print(\"No search results found in the response.\")\n", + " return {}\n", + "\n", + "\n", + " # Display matching images\n", + " displayed_images = {}\n", + " for file in retrieved_files:\n", + " if file in IMAGE_ID_MAPPING and IMAGE_ID_MAPPING[file]:\n", + " image_path = Path(cache_dir) / 'images' / IMAGE_ID_MAPPING[file]\n", + " print(f\"Displaying image for {file}:\")\n", + " display(Image(str(image_path)))\n", + " displayed_images[file] = str(image_path)\n", + " \n", + " return displayed_images\n", + "\n", + "displayed = display_retrieved_images(response)\n", + "print(f\"Displayed {len(displayed)} images\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"Were there any negative reviews for pizza, and if so, was the pizza burnt?\"\n", + "print(f\"🔍 Query: {query}\\n\")\n", + "\n", + "response = client.responses.create(\n", + " model=\"gpt-4.1\",\n", + " input=query,\n", + " tools=[{\n", + " \"type\": \"file_search\",\n", + " \"vector_store_ids\": [text_image_vector_store_id],\n", + " \"filters\": {\n", + " \"type\": \"eq\",\n", + " \"key\": \"month\",\n", + " \"value\": \"june\"\n", + " }\n", + " }]\n", + ")\n", + "\n", + "print(\"📝 Response:\")\n", + "print(\"-\" * 40)\n", + "print(response.output_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can confirm if this is correct by checking the retrieved images." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "displayed = display_retrieved_images(response)\n", + "print(f\"Displayed {len(displayed)} images\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation and Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As our dataset likely evolves over time and we want to evaluate new models, we can use the OpenAI Evaluation API to evaluate the performance of our system for sentiment analysis. In this simple example, using the string_check criteria we checked if the output was one of the three possible values: positive, negative, or unclear." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_evaluation_data(df, text_col=\"full_sentiment\", label_col=\"label\"):\n", + " \"\"\"Prepare data items for evaluation from DataFrame.\"\"\"\n", + " return [{\"item\": {\"input\": str(row[text_col]), \"ground_truth\": row[label_col]}} \n", + " for _, row in df.iterrows()]\n", + "\n", + "\n", + "# def create_eval_run(evaluation_data):\n", + "# eval_config = {\n", + "# \"type\": \"completions\",\n", + "# \"model\": \"gpt-4.1\",\n", + "# \"input_messages\": {\n", + "# \"type\": \"template\",\n", + "# \"template\": [\n", + "# {\n", + "# \"type\": \"message\",\n", + "# \"role\": \"user\",\n", + "# \"content\": {\n", + "# \"type\": \"input_text\",\n", + "# \"text\": \"Classify the sentiment of this food delivery review: {{ item.input }}. Categorize the request into one of \\\"positive\\\", \\\"negative\\\" or \\\"unclear\\\". Respond with only one of those words.\"\n", + "# }\n", + "# }\n", + "# ]\n", + "# },\n", + "# \"source\": {\n", + "# \"type\": \"file_content\",\n", + "# \"content\": evaluation_data\n", + "# }\n", + "# }\n", + "\n", + "# # Create and monitor evaluation run\n", + "# run = client.evals.runs.create(\n", + "# eval_id=eval_obj.id,\n", + "# data_source=eval_config\n", + "# )\n", + "\n", + "# print(\"✅ Evaluation run created successfully\")\n", + "# print(f\"Run ID: {run.id}\")\n", + "# return run.id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_evaluation_data(\n", + " df: pd.DataFrame,\n", + " text_col: str = \"full_sentiment\",\n", + " label_col: str = \"label\"\n", + ") -> list:\n", + " \"\"\"\n", + " Prepare evaluation data items from a DataFrame.\n", + " \n", + " Args:\n", + " df: Input pandas DataFrame.\n", + " text_col: Column containing the input text.\n", + " label_col: Column containing the ground truth label.\n", + " \n", + " Returns:\n", + " List of dicts formatted for evaluation.\n", + " \"\"\"\n", + " return [\n", + " {\"item\": {\"input\": str(row[text_col]), \"ground_truth\": row[label_col]}}\n", + " for _, row in df.iterrows()\n", + " ]\n", + "\n", + "def create_eval_run(evaluation_data: list, eval_id: str) -> str:\n", + " \"\"\"\n", + " Create and launch an evaluation run.\n", + " \n", + " Args:\n", + " evaluation_data: List of evaluation items.\n", + " eval_id: The evaluation object ID.\n", + " \n", + " Returns:\n", + " The run ID as a string.\n", + " \"\"\"\n", + " eval_config = {\n", + " \"type\": \"completions\",\n", + " \"model\": \"gpt-4.1\",\n", + " \"input_messages\": {\n", + " \"type\": \"template\",\n", + " \"template\": [\n", + " {\n", + " \"type\": \"message\",\n", + " \"role\": \"user\",\n", + " \"content\": {\n", + " \"type\": \"input_text\",\n", + " \"text\": (\n", + " \"Classify the sentiment of this food delivery review: {{ item.input }}. \"\n", + " \"Categorize the request into one of \\\"positive\\\", \\\"negative\\\" or \\\"unclear\\\". 
\"\n", + " \"Respond with only one of those words.\"\n", + " )\n", + " }\n", + " }\n", + " ]\n", + " },\n", + " \"source\": {\n", + " \"type\": \"file_content\",\n", + " \"content\": evaluation_data\n", + " }\n", + " }\n", + "\n", + " run = client.evals.runs.create(\n", + " eval_id=eval_id,\n", + " data_source=eval_config\n", + " )\n", + " print(\"✅ Evaluation run created successfully\")\n", + " print(f\"Run ID: {run.id}\")\n", + " return run.id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eval_obj = client.evals.create(\n", + " name=\"food-categorization-eval\",\n", + " data_source_config={\n", + " \"type\": \"custom\",\n", + " \"item_schema\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"input\": {\"type\": \"string\"},\n", + " \"ground_truth\": {\"type\": \"string\"}\n", + " },\n", + " \"required\": [\"input\", \"ground_truth\"]\n", + " },\n", + " \"include_sample_schema\": True\n", + " },\n", + " testing_criteria=[\n", + " {\n", + " \"type\": \"string_check\",\n", + " \"name\": \"Match output to human label\",\n", + " \"input\": \"{{sample.output_text}}\",\n", + " \"reference\": \"{{item.ground_truth}}\",\n", + " \"operation\": \"eq\"\n", + " }\n", + " ]\n", + ")\n", + "eval_id = eval_obj.id\n", + "eval_id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create evaluation runs\n", + "\n", + "evaluation_data = prepare_evaluation_data(df, text_col=\"text\")\n", + "text_only_run_id = create_eval_run(evaluation_data, eval_id)\n", + "\n", + "evaluation_data = prepare_evaluation_data(df)\n", + "text_image_run_id = create_eval_run(evaluation_data, eval_id)\n", + "\n", + "# retrieve both run urls\n", + "\n", + "text_only_run = client.evals.runs.retrieve(eval_id=eval_id, run_id=text_only_run_id)\n", + "print(text_only_run.to_dict()['report_url'])\n", + "\n", + "text_image_run = client.evals.runs.retrieve(eval_id=eval_obj.id, run_id=text_image_run_id)\n", + "print(text_image_run.to_dict()['report_url'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can retrieve the results of these evaluation runs and perform some local analysis. In this case, we will compare the performance of the text-only and text+image runs and evaluate how increasing the number of total tokens (through the addition of image context) affects the accuracy of the model. We can also do some basic error analysis by analysing the model input of the failed examples." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "text_only_run_output_items = client.runs.output_items(text_only_run)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate passed and total for text_only_run\n", + "text_only_data = text_only_run_output_items.to_dict()['data']\n", + "text_only_passed = sum(1 for output_item in text_only_data if output_item['results'][0]['passed'])\n", + "text_only_total = len(text_only_data)\n", + "\n", + "# Calculate passed and total for text_image_run\n", + "text_image_data = text_image_run_output_items.to_dict()['data']\n", + "text_image_passed = sum(1 for output_item in text_image_data if output_item['results'][0]['passed'])\n", + "text_image_total = len(text_image_data)\n", + "\n", + "# Calculate average total_tokens for each run\n", + "def avg_total_tokens(data):\n", + " tokens = [item['sample']['usage']['total_tokens'] for item in data if 'usage' in item['sample']]\n", + " return sum(tokens) / len(tokens) if tokens else 0\n", + "\n", + "text_only_avg_tokens = avg_total_tokens(text_only_data)\n", + "text_image_avg_tokens = avg_total_tokens(text_image_data)\n", + "\n", + "# Plotting\n", + "labels = ['Text Only', 'Text + Image']\n", + "passed = [text_only_passed, text_image_passed]\n", + "avg_tokens = [text_only_avg_tokens, text_image_avg_tokens]\n", + "\n", + "x = np.arange(len(labels))\n", + "width = 0.35\n", + "\n", + "fig, ax1 = plt.subplots()\n", + "\n", + "# Bar for passed only\n", + "bars1 = ax1.bar(x - width/2, passed, width, label='Passed', color='green')\n", + "ax1.set_ylabel('Accuracy')\n", + "ax1.set_xticks(x)\n", + "ax1.set_xticklabels(labels)\n", + "ax1.set_title('Accuracy and Avg Total Tokens')\n", + "ax1.legend(loc='upper left')\n", + "\n", + "# Second y-axis for avg total tokens\n", + "ax2 = ax1.twinx()\n", + "bars2 = ax2.bar(x + width/2, avg_tokens, width, label='Avg Total Tokens', color='blue', alpha=0.5)\n", + "ax2.set_ylabel('Avg Total Tokens')\n", + "ax2.legend(loc='upper right')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "failed_samples = [\n", + " {\n", + " \"Input\": sample['sample']['input'],\n", + " \"Model Output\": sample['sample']['output']\n", + " }\n", + " for sample in text_only_run_output_items.to_dict()['data']\n", + " if not sample['results'][0]['passed']\n", + "]\n", + "\n", + "pd.set_option('display.max_colwidth', 150) # Adjust as needed\n", + "\n", + "failed_df = pd.DataFrame(failed_samples)\n", + "display(failed_df.style.set_properties(**{'text-align': 'left'}))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, let's clean up some of the resources we created." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# delete vector store\n", + "client.vector_stores.delete(vector_store)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "openai", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From d68a124316574b7e7a0dd39092180cd25344f1ed Mon Sep 17 00:00:00 2001 From: Robert Tinn Date: Mon, 12 May 2025 12:35:55 +0100 Subject: [PATCH 2/5] updated image understanding vector stores --- .../image_understanding_with_rag.ipynb | 135 +++++++++--------- 1 file changed, 65 insertions(+), 70 deletions(-) diff --git a/examples/multimodal/image_understanding_with_rag.ipynb b/examples/multimodal/image_understanding_with_rag.ipynb index 242cee2ae4..f70da378a6 100644 --- a/examples/multimodal/image_understanding_with_rag.ipynb +++ b/examples/multimodal/image_understanding_with_rag.ipynb @@ -6,11 +6,11 @@ "source": [ "# Image Understanding with RAG using OpenAI's Vision & Responses APIs\n", "\n", - "Welcome! This cookbook guides you through working with multimodal data, using OpenAI's Vision & Responses APIs, with image understanding and file search capabilities. It demonstrates how to build a RAG system, powered by GPT 4.1, that can analyse customer experiences, from their feedback which can be both visual and text-based.\n", + "Welcome! This notebook demonstrates how to build a Retrieval-Augmented Generation (RAG) system using OpenAI’s Vision and Responses APIs. It focuses on multimodal data, specifically, combining image and text inputs to analyze customer experiences. The system leverages GPT-4.1 and integrates image understanding with file search to provide context-aware responses.\n", "\n", - "Many datasets are multimodal, often containing both text and image data. A good example of this is in radiology in healthcare, where patient records contain both image scan and written report. In addition, many real-world datasets are noisy and contain missing or incomplete data meaning valuable information can be missed with analysing multiple modalities.\n", + "Multimodal datasets are increasingly common, particularly in domains like healthcare, where records often contain both visual data (e.g. radiology scans) and accompanying text (e.g. clinical notes). Real-world datasets also tend to be noisy, with incomplete or missing information, making it critical to analyze multiple modalities in tandem.\n", "\n", - "This guide covers a common use case in customer service, which is analysing the experience of customers. This guide will cover synthetic generation for text and image modalities, combining image analysis with file search for more robust, context-aware answers from a RAG system and it also leverages the Evals API to evaluate the performance gain of including image understanding in the RAG system.\n", + "This guide focuses on a customer service use case: evaluating customer feedback that may include screenshots, photos, and written complaints. 
You’ll learn how to synthetically generate both image and text inputs, use file search for context retrieval, and apply the Evals API to assess how incorporating image understanding impacts overall performance.\n", "\n", "---\n", "\n", @@ -49,7 +49,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install openai evals pandas matplotlib tqdm ipython --upgrade --quiet" + "%pip install openai evals pandas numpy matplotlib tqdm ipython --upgrade --quiet" ] }, { @@ -60,9 +60,11 @@ "source": [ "import base64\n", "from io import BytesIO\n", + "import os\n", "from pathlib import Path\n", "\n", "import matplotlib.pyplot as plt\n", + "import numpy as np\n", "import pandas as pd\n", "from openai import OpenAI\n", "from IPython.display import display, Image\n", @@ -80,7 +82,7 @@ "source": [ "## Example Generations\n", "\n", - "Given how expensive it can be generate high-quality training and evaluation data for machine learning tasks, utilising synthetic data can be an effective alternative. The OpenAI Image API can be used to generate synthetic images for this purpose and this cookbook uses the Responses API to generate synthetic text data." + "Generating high-quality training and evaluation data for machine learning tasks can be costly and time-consuming. Synthetic data offers a practical and scalable alternative. In this notebook, the OpenAI Image API is used to generate synthetic images, while the Responses API is employed to create synthetic text, enabling efficient prototyping and experimentation across multimodal tasks." ] }, { @@ -147,7 +149,7 @@ "source": [ "## Data Processing\n", "\n", - "In this instance, we will use a pre-generated synthetic dataset of customer feedback, which includes both short text snippets and images from customer reviews, sometimes combined. You can also generate your own synthetic dataset for this cookbook using the above examples." + "In this example, we’ll work with a pre-generated synthetic dataset of customer feedback that includes short text snippets, images from customer reviews, and occasionally combined multimodal entries. You can also generate your own synthetic dataset using the examples provided above to tailor the data to your specific use case." ] }, { @@ -249,7 +251,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This example uses OpenAI's built-in vector store and file search capabilities to build a RAG system that can analyse customer experiences, from their feedback which can be both visual and text-based." + "This example uses OpenAI's built-in vector store and file search capabilities to build a RAG system that can analyse customer experiences, from their feedback which can be both visual and text-based. We create two vector stores for comparisons, one with image understanding and one without." 
] }, { @@ -291,10 +293,13 @@ "source": [ "# upload files to vector database and set metadata\n", "\n", - "def upload_files_to_vector_store(vector_store_id, df):\n", + "def upload_files_to_vector_store(vector_store_id, df, column_name=\"full_sentiment\"):\n", " file_ids = []\n", " for i, row in tqdm(df.iterrows(), total=len(df), desc=\"Uploading context files\"):\n", - " file_stream = BytesIO(row[\"full_sentiment\"].encode('utf-8'))\n", + " if pd.isna(row[column_name]):\n", + " file_stream = BytesIO('No information available.'.encode('utf-8'))\n", + " else:\n", + " file_stream = BytesIO(row[column_name].encode('utf-8'))\n", " file_stream.name = f\"context_{row.get('id', i)}_{row.get('month', '')}.txt\"\n", " \n", " file = client.vector_stores.files.upload(\n", @@ -308,9 +313,7 @@ " vector_store_id=vector_store_id,\n", " file_id=file_ids[i],\n", " attributes={\"month\": row[\"month\"]}\n", - " )\n", - " import time\n", - " time.sleep(1) # TODO" + " )" ] }, { @@ -319,8 +322,17 @@ "metadata": {}, "outputs": [], "source": [ - "upload_files_to_vector_store(text_image_vector_store_id, df)\n", - "upload_files_to_vector_store(text_image_vector_store_id, df) " + "upload_files_to_vector_store(text_image_vector_store_id, df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "upload_files_to_vector_store(text_vector_store_id, df, column_name=\"text\") " ] }, { @@ -332,6 +344,13 @@ "We can analyse our dataset with natural language queries with the help of File Search. For the text-only dataset, we see that information is missing that could inform our analysis.\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The only positive review for spaghetti in July has visual feedback and we can see the RAG system with only text based context available is uncertain about positive details. However with image context provided the second RAG system is able to provide a more accurate response." + ] + }, { "cell_type": "code", "execution_count": null, @@ -339,7 +358,7 @@ "outputs": [], "source": [ "# Query the vector store for spaghetti reviews in July\n", - "query = \"What were the reviews like for the spaghetti?\"\n", + "query = \"What were the reviews like for the spaghetti presentation?\"\n", "print(f\"🔍 Query: {query}\\n\")\n", "\n", "# Execute the search with filtering\n", @@ -369,7 +388,7 @@ "metadata": {}, "outputs": [], "source": [ - "query = \"What were the reviews like for the spaghetti?\"\n", + "query = \"What were the reviews like for the spaghetti presentation?\"\n", "print(f\"🔍 Query: {query}\\n\")\n", "\n", "response = client.responses.create(\n", @@ -410,9 +429,9 @@ "}\n", "\n", "def display_retrieved_images(\n", - " response: Any,\n", + " response,\n", " cache_dir: str = \".local_cache\"\n", - ") -> Dict[str, str]:\n", + "):\n", " \"\"\"\n", " Display images from the retrieved search results.\n", " \n", @@ -447,6 +466,13 @@ "print(f\"Displayed {len(displayed)} images\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Likewise we can test this for negative reviews in June." 
+ ] + }, { "cell_type": "code", "execution_count": null, @@ -518,46 +544,6 @@ " for _, row in df.iterrows()]\n", "\n", "\n", - "# def create_eval_run(evaluation_data):\n", - "# eval_config = {\n", - "# \"type\": \"completions\",\n", - "# \"model\": \"gpt-4.1\",\n", - "# \"input_messages\": {\n", - "# \"type\": \"template\",\n", - "# \"template\": [\n", - "# {\n", - "# \"type\": \"message\",\n", - "# \"role\": \"user\",\n", - "# \"content\": {\n", - "# \"type\": \"input_text\",\n", - "# \"text\": \"Classify the sentiment of this food delivery review: {{ item.input }}. Categorize the request into one of \\\"positive\\\", \\\"negative\\\" or \\\"unclear\\\". Respond with only one of those words.\"\n", - "# }\n", - "# }\n", - "# ]\n", - "# },\n", - "# \"source\": {\n", - "# \"type\": \"file_content\",\n", - "# \"content\": evaluation_data\n", - "# }\n", - "# }\n", - "\n", - "# # Create and monitor evaluation run\n", - "# run = client.evals.runs.create(\n", - "# eval_id=eval_obj.id,\n", - "# data_source=eval_config\n", - "# )\n", - "\n", - "# print(\"✅ Evaluation run created successfully\")\n", - "# print(f\"Run ID: {run.id}\")\n", - "# return run.id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ "def prepare_evaluation_data(\n", " df: pd.DataFrame,\n", " text_col: str = \"full_sentiment\",\n", @@ -687,22 +673,16 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, "source": [ - "We can retrieve the results of these evaluation runs and perform some local analysis. In this case, we will compare the performance of the text-only and text+image runs and evaluate how increasing the number of total tokens (through the addition of image context) affects the accuracy of the model. We can also do some basic error analysis by analysing the model input of the failed examples." + "text_only_run_output_items = client.evals.runs.output_items.list(eval_id=eval_id, run_id=text_only_run_id)\n", + "text_image_run_output_items = client.evals.runs.output_items.list(eval_id=eval_id, run_id=text_image_run_id)" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "text_only_run_output_items = client.runs.output_items(text_only_run)" + "We can retrieve the results of these evaluation runs and perform some local analysis. In this case, we will compare the performance of the text-only and text+image runs and evaluate how increasing the number of total tokens (through the addition of image context) affects the accuracy of the model. We can also do some basic error analysis by analysing the model input of the failed examples." 
    ]
   },
   {
@@ -790,9 +770,24 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# delete vector store\n",
-    "client.vector_stores.delete(vector_store)"
+    "# delete vector stores\n",
+    "deleted_vector_store = client.vector_stores.delete(\n",
+    "    vector_store_id=text_vector_store_id\n",
+    ")\n",
+    "print(deleted_vector_store)\n",
+    "\n",
+    "deleted_vector_store = client.vector_stores.delete(\n",
+    "    vector_store_id=text_image_vector_store_id\n",
+    ")\n",
+    "print(deleted_vector_store)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

From d34d56ccdfd329395559bcd6ea1b80f46200833f Mon Sep 17 00:00:00 2001
From: Robert Tinn
Date: Mon, 12 May 2025 12:58:38 +0100
Subject: [PATCH 3/5] Image understanding tidy up prompts

---
 .../multimodal/image_understanding_with_rag.ipynb | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/examples/multimodal/image_understanding_with_rag.ipynb b/examples/multimodal/image_understanding_with_rag.ipynb
index f70da378a6..93b686a198 100644
--- a/examples/multimodal/image_understanding_with_rag.ipynb
+++ b/examples/multimodal/image_understanding_with_rag.ipynb
@@ -322,16 +322,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "upload_files_to_vector_store(text_image_vector_store_id, df)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "\n",
+    "upload_files_to_vector_store(text_image_vector_store_id, df)\n",
     "upload_files_to_vector_store(text_vector_store_id, df, column_name=\"text\") "
    ]
   },
@@ -358,7 +349,7 @@
    "outputs": [],
    "source": [
     "# Query the vector store for spaghetti reviews in July\n",
-    "query = \"What were the reviews like for the spaghetti presentation?\"\n",
+    "query = \"Were there any comments about the 'spaghetti'?\"\n",
     "print(f\"🔍 Query: {query}\\n\")\n",
     "\n",
     "# Execute the search with filtering\n",
@@ -388,7 +379,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "query = \"What were the reviews like for the spaghetti presentation?\"\n",
+    "query = \"Were there any comments about the 'spaghetti'?\"\n",
    "print(f\"🔍 Query: {query}\\n\")\n",
     "\n",
     "response = client.responses.create(\n",

From 419d4428b6be12418741d54d03713546eb9689f4 Mon Sep 17 00:00:00 2001
From: Robert Tinn
Date: Tue, 13 May 2025 09:54:22 +0100
Subject: [PATCH 4/5] Small comments to image_understanding notebook

---
 .../image_understanding_with_rag.ipynb | 30 ++++++++++--------------------
 1 file changed, 10 insertions(+), 20 deletions(-)

diff --git a/examples/multimodal/image_understanding_with_rag.ipynb b/examples/multimodal/image_understanding_with_rag.ipynb
index 93b686a198..97473732f1 100644
--- a/examples/multimodal/image_understanding_with_rag.ipynb
+++ b/examples/multimodal/image_understanding_with_rag.ipynb
@@ -6,11 +6,11 @@
    "source": [
     "# Image Understanding with RAG using OpenAI's Vision & Responses APIs\n",
     "\n",
-    "Welcome! This notebook demonstrates how to build a Retrieval-Augmented Generation (RAG) system using OpenAI’s Vision and Responses APIs. It focuses on multimodal data, specifically, combining image and text inputs to analyze customer experiences. The system leverages GPT-4.1 and integrates image understanding with file search to provide context-aware responses.\n",
+    "Welcome! This notebook demonstrates how to build a Retrieval-Augmented Generation (RAG) system using OpenAI’s Vision and Responses APIs. It focuses on multimodal data, combining image and text inputs to analyze customer experiences. The system leverages GPT-4.1 and integrates image understanding with file search to provide context-aware responses.\n",
     "\n",
     "Multimodal datasets are increasingly common, particularly in domains like healthcare, where records often contain both visual data (e.g. radiology scans) and accompanying text (e.g. clinical notes). Real-world datasets also tend to be noisy, with incomplete or missing information, making it critical to analyze multiple modalities in tandem.\n",
     "\n",
-    "This guide focuses on a customer service use case: evaluating customer feedback that may include screenshots, photos, and written complaints. You’ll learn how to synthetically generate both image and text inputs, use file search for context retrieval, and apply the Evals API to assess how incorporating image understanding impacts overall performance.\n",
+    "This guide focuses on a customer service use case: evaluating customer feedback that may include photos and written reviews. You’ll learn how to synthetically generate both image and text inputs, use file search for context retrieval, and apply the Evals API to assess how incorporating image understanding impacts overall performance.\n",
     "\n",
     "---\n",
     "\n",
@@ -251,7 +251,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "This example uses OpenAI's built-in vector store and file search capabilities to build a RAG system that can analyse customer experiences, from their feedback which can be both visual and text-based. We create two vector stores for comparisons, one with image understanding and one without."
+    "This example uses OpenAI's built-in vector store and file search capabilities to build a RAG system that can analyse customer experiences from their feedback, which can be both visual and text-based. We create two vector stores for comparison: one with image understanding and one without."
    ]
   },
   {
@@ -323,7 +323,7 @@
    "outputs": [],
    "source": [
     "upload_files_to_vector_store(text_image_vector_store_id, df)\n",
-    "upload_files_to_vector_store(text_vector_store_id, df, column_name=\"text\") "
+    "upload_files_to_vector_store(text_vector_store_id, df, column_name=\"text\")"
    ]
   },
   {
@@ -332,14 +332,9 @@
    "source": [
     "# Retrieval and Filtering\n",
     "\n",
-    "We can analyse our dataset with natural language queries with the help of File Search. For the text-only dataset, we see that information is missing that could inform our analysis.\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The only positive review for spaghetti in July has visual feedback and we can see the RAG system with only text based context available is uncertain about positive details. However with image context provided the second RAG system is able to provide a more accurate response."
+    "We can analyse our dataset with natural language queries with the help of File Search. For the text-only dataset, we see that information is missing that could inform our analysis.\n",
+    "\n",
+    "The only positive review for spaghetti in July has visual feedback, and we can see that the RAG system with only text-based context available is uncertain about the positive details. However, with image context provided, the second RAG system is able to give a more accurate response.\n"
    ]
   },
   {
@@ -461,7 +456,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Likewise we can test this for negative reviews in June."
+    "Likewise, we can test this for negative reviews in June concerning any burnt pizza."
    ]
   },
   {
@@ -665,6 +660,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# you may need to wait a few seconds before running this cell for the eval runs to finish up\n",
+    "\n",
     "text_only_run_output_items = client.evals.runs.output_items.list(eval_id=eval_id, run_id=text_only_run_id)\n",
     "text_image_run_output_items = client.evals.runs.output_items.list(eval_id=eval_id, run_id=text_image_run_id)"
    ]
@@ -772,13 +769,6 @@
     ")\n",
     "print(deleted_vector_store)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {

From 1a1f0462636ceb8c159a6e68f05b294a197f1293 Mon Sep 17 00:00:00 2001
From: Robert Tinn
Date: Fri, 16 May 2025 20:56:22 +0100
Subject: [PATCH 5/5] Image understanding updating authors

---
 authors.yaml  |  5 +++++
 registry.yaml | 11 +++++++++++
 2 files changed, 16 insertions(+)

diff --git a/authors.yaml b/authors.yaml
index 099b8b735c..0834085cc6 100644
--- a/authors.yaml
+++ b/authors.yaml
@@ -3,6 +3,11 @@
 # You can optionally customize how your information shows up cookbook.openai.com over here.
 # If your information is not present here, it will be pulled from your GitHub profile.
 
+robert-tinn:
+  name: "Robert Tinn"
+  website: "https://www.linkedin.com/in/robert-tinn/"
+  avatar: "https://avatars.githubusercontent.com/u/208724428?v=4"
+
 minh-hoque:
   name: "Minhajul Hoque"
   website: "https://www.linkedin.com/in/minhajul-hoque-83242b163/"
diff --git a/registry.yaml b/registry.yaml
index c8506ad716..981b0f0860 100644
--- a/registry.yaml
+++ b/registry.yaml
@@ -4,6 +4,17 @@
 # should build pages for, and indicates metadata such as tags, creation date and
 # authors for each page.
 
+- title: Image Understanding with RAG
+  path: examples/multimodal/image_understanding_with_rag.ipynb
+  date: 2025-05-16
+  authors:
+    - robert-tinn
+  tags:
+    - responses
+    - images
+    - RAG
+    - vision
+
 - title: Comparing Speech-to-Text Methods with the OpenAI API
   path: examples/Speech_transcription_methods.ipynb
   date: 2025-04-29