diff --git a/examples/Reinforcement_Fine_Tuning.ipynb b/examples/Reinforcement_Fine_Tuning.ipynb index 6bd67eefd2..fcb7193209 100644 --- a/examples/Reinforcement_Fine_Tuning.ipynb +++ b/examples/Reinforcement_Fine_Tuning.ipynb @@ -61,9 +61,17 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 1, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/theophile/Documents/repos/jupyter-env/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -97,7 +105,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -130,7 +138,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -153,7 +161,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -215,7 +223,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -246,7 +254,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -269,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -294,7 +302,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -431,7 +439,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -544,49 +552,49 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", 
"output_type": "stream", "text": [ - "Grading predictions: 100%|██████████| 100/100 [00:00<00:00, 329740.88it/s]\n" + "Grading predictions: 100%|██████████| 100/100 [00:00<00:00, 610524.60it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "{'total_samples': 100, 'accuracy': 0.5716752010712578}\n" + "{'total_samples': 100, 'accuracy': 0.590985993228499}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Grading predictions: 100%|██████████| 100/100 [00:00<00:00, 497544.96it/s]\n" + "Grading predictions: 100%|██████████| 100/100 [00:00<00:00, 311612.48it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "{'total_samples': 100, 'accuracy': 0.5855097792577905}\n" + "{'total_samples': 100, 'accuracy': 0.5750433490539723}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Grading predictions: 100%|██████████| 100/100 [00:00<00:00, 414456.92it/s]" + "Grading predictions: 100%|██████████| 100/100 [00:00<00:00, 769597.06it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "{'total_samples': 100, 'accuracy': 0.5702082734545793}\n" + "{'total_samples': 100, 'accuracy': 0.5943742483874717}\n" ] }, { @@ -625,7 +633,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -633,32 +641,32 @@ "output_type": "stream", "text": [ "\n", - "Total mistakes: 84\n", + "Total mistakes: 86\n", "\n", - "[Sample 16]\n", - " Model prediction: enveloped double stranded linear dna virus\n", - " Reference answer: double-stranded, enveloped dna virus\n", - " Score: 0.85\n", + "[Sample 18]\n", + " Model prediction: acute anterior uveitis\n", + " Reference answer: recurring eye redness and pain\n", + " Score: 0.3596153846153846\n", "\n", "[Sample 19]\n", - " Model prediction: gallstone ileus\n", - " Reference answer: gall stone ileus\n", - " Score: 0.8225806451612904\n", + " Model prediction: 390 meq\n", + " Reference answer: 150 meq\n", + " Score: 
0.6071428571428571\n", "\n", "[Sample 20]\n", - " Model prediction: acute rheumatic fever\n", - " Reference answer: postinfectious glomerulonephritis\n", - " Score: 0.22037037037037036\n", + " Model prediction: adamts13 deficiency\n", + " Reference answer: decreased adamts13 activity in serum\n", + " Score: 0.5037037037037037\n", "\n", "[Sample 22]\n", - " Model prediction: amygdala\n", - " Reference answer: hippocampus\n", - " Score: 0.17894736842105263\n", + " Model prediction: todd paralysis\n", + " Reference answer: seizure\n", + " Score: 0.16190476190476194\n", "\n", "[Sample 23]\n", - " Model prediction: hypopituitarism\n", - " Reference answer: pituitary adenoma\n", - " Score: 0.47812499999999997\n" + " Model prediction: hypokalemia\n", + " Reference answer: hypomagnesemia\n", + " Score: 0.612\n" ] } ], @@ -694,22 +702,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 84, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -734,49 +742,49 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Grading predictions: 100%|██████████| 100/100 [00:00<00:00, 489988.79it/s]\n" + "Grading predictions: 100%|██████████| 100/100 [00:00<00:00, 820803.13it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "{'total_samples': 100, 'accuracy': 0.6150339441350683}\n" + "{'total_samples': 100, 'accuracy': 0.6186850707880021}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Grading predictions: 100%|██████████| 100/100 [00:00<00:00, 507170.98it/s]\n" + "Grading predictions: 100%|██████████| 100/100 [00:00<00:00, 523633.46it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "{'total_samples': 100, 'accuracy': 0.5901906182115139}\n" + "{'total_samples': 100, 'accuracy': 0.6149897683385446}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Grading predictions: 100%|██████████| 100/100 [00:00<00:00, 543303.63it/s]" + "Grading predictions: 100%|██████████| 100/100 [00:00<00:00, 515270.76it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "{'total_samples': 100, 'accuracy': 0.5927679005876193}\n" + "{'total_samples': 100, 'accuracy': 0.6254662232084496}\n" ] }, { @@ -802,7 +810,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -863,12 +871,12 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -928,7 +936,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -988,7 +996,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -1020,7 +1028,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -1056,21 +1064,50 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ + "response_format = {\n", + " \"name\": \"float_score_classification\",\n", + " \"strict\": True,\n", + " \"schema\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"steps\": {\n", + " \"type\": \"array\",\n", + " \"description\": \"A sequence of steps outlining the reasoning process.\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"description\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"Detailed description of the reasoning in this step.\"\n", + " },\n", + " \"conclusion\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The conclusion of the reasoning in this step.\"\n", + " }\n", + " },\n", + " \"required\": [\"description\", \"conclusion\"],\n", + " \"additionalProperties\": False\n", + " }\n", + " },\n", + " \"result\": {\n", + " \"type\": \"number\",\n", + " \"description\": \"The float score assigned to the response. 
This should be in inclusive range RANGE_MIN to RANGE_MAX.\"\n", + " }\n", + " },\n", + " \"required\": [\"steps\", \"result\"],\n", + " \"additionalProperties\": False\n", + " }\n", + "}\n", "\n", - "from pydantic import BaseModel\n", - "from typing import List\n", - "\n", - "class GraderStep(BaseModel):\n", - " description: str\n", - " conclusion: str\n", - "\n", - "class GraderResponse(BaseModel):\n", - " result: float\n", - " steps: List[GraderStep]\n", + "# for completions\n", + "response_format = {\n", + " \"type\": \"json_schema\",\n", + " \"json_schema\": response_format\n", + "}\n", "\n", "# Adapted python_model_grader to match the other graders' interface\n", "def python_model_grader(sample, item, model_grader=model_grader_1):\n", @@ -1088,18 +1125,17 @@ " {\"role\": \"user\", \"content\": user_prompt_filled}\n", " ]\n", " # Call the OpenAI API with the grader's model\n", - " response = client.beta.chat.completions.parse(\n", + " response = client.chat.completions.create(\n", " model=model_grader[\"model\"],\n", " messages=messages,\n", " seed=model_grader.get(\"sampling_params\", {}).get(\"seed\", None),\n", " temperature=model_grader.get(\"sampling_params\", {}).get(\"temperature\", 0),\n", - " response_format=GraderResponse,\n", + " response_format=response_format,\n", " )\n", " # Parse the float score from the model's JSON response\n", - " parsed = response.choices[0].message.parsed\n", - " if not isinstance(parsed, GraderResponse):\n", - " raise RuntimeError(f\"Grader returned invalid structured output: {parsed!r}\")\n", - " return float(parsed.result)" + " parsed = json.loads(response.choices[0].message.content)\n", + " \n", + " return float(parsed[\"result\"])" ] }, { @@ -1128,7 +1164,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -1238,7 +1274,7 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ 
-1300,9 +1336,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Grader validated\n" + ] + } + ], "source": [ "import requests\n", "\n", @@ -1334,9 +1378,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training file detected: data/medical_01_verifiable_problem_train_simple_prompt.jsonl\n", + "Uploading file: data/medical_01_verifiable_problem_train_simple_prompt.jsonl\n", + "File uploaded successfully. File ID: file-19L9jKsJXNJ17DtjvPwN3M\n", + "test file detected: data/medical_01_verifiable_problem_val_simple_prompt.jsonl\n", + "Uploading file: data/medical_01_verifiable_problem_val_simple_prompt.jsonl\n", + "File uploaded successfully. File ID: file-78q2N1QAMKhLiRK3zVB6MC\n" + ] + } + ], "source": [ "# Set your training and test file paths\n", "train_file = \"data/medical_01_verifiable_problem_train_simple_prompt.jsonl\"\n", @@ -1371,7 +1428,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's now define the hyper-parameters for our run. We will be fine-tuning `o4-mini`, with the `medium` reasoning effort. This parameter will impact the length by limiting the number of tokens the model uses to reason. We tune with a moderate compute multiplier and reasonable number of epochs, prioritizing efficiency and fast iteration. You’ll want to tailor these depending on your budget, desired generalization, and dataset difficulty." + "Let's now define the hyper-parameters for our run. We will be fine-tuning `o4-mini`, with the `medium` reasoning effort. This parameter will impact the duration by limiting the number of tokens the model uses to reason. We tune with a moderate compute multiplier and reasonable number of epochs, prioritizing efficiency and fast iteration. 
Additionally, we set the `eval_samples` parameter to 3 to make the validation curves more robust given the stochasticity of `o4-mini`’s outputs. Averaging across multiple samples reduces noise and helps reveal consistent patterns of learning.\n", + "\n", + "You’ll want to tailor these depending on your budget, desired generalization, and dataset difficulty." ] }, { @@ -1387,9 +1446,9 @@ "n_epochs = 5\n", "seed = 42\n", "grader = model_grader_2\n", - "response_format = None\n", + "response_format_predictions = None\n", "compute_multiplier = 1.0\n", - "eval_samples = 1\n", + "eval_samples = 3\n", "eval_interval = 5" ] }, @@ -1404,7 +1463,16 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training job created with ID: ftjob-tt3B7l45hLUoaXGJRfoL1lLT\n", + "View the job details at: https://platform.openai.com/finetune/ftjob-tt3B7l45hLUoaXGJRfoL1lLT\n" + ] + } + ], "source": [ "# Launch the RFT job\n", "payload = dict(\n", @@ -1416,7 +1484,7 @@ " type=\"reinforcement\",\n", " reinforcement=dict(\n", " grader=grader,\n", - " response_format=response_format,\n", + " response_format=response_format_predictions,\n", " hyperparameters=dict(\n", " compute_multiplier=compute_multiplier,\n", " eval_samples=eval_samples,\n", @@ -1511,55 +1579,54 @@ "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "Generating predictions (run 1): 0%| | 0/100 [00:00" ] @@ -1843,7 +1913,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 73, "metadata": {}, "outputs": [ { @@ -1851,32 +1921,32 @@ "output_type": "stream", "text": [ "\n", - "Total mistakes: 80\n", + "Total mistakes: 84\n", "\n", - "[Sample 5]\n", - " Model prediction: carotid duplex ultrasound\n", - " Reference answer: carotid doppler\n", - " Score: 0.5525\n", + "[Sample 9]\n", + " Model prediction: ventilation-perfusion scan\n", + " 
Reference answer: lung ventilation-perfusion scan\n", + " Score: 0.989\n", "\n", - "[Sample 6]\n", - " Model prediction: under fixation due to insufficient fixation time\n", - " Reference answer: incomplete fixation\n", - " Score: 0.5037037037037037\n", + "[Sample 11]\n", + " Model prediction: autoimmune destruction of melanocytes (vitiligo)\n", + " Reference answer: autoimmune melanocyte destruction\n", + " Score: 0.991\n", "\n", - "[Sample 7]\n", - " Model prediction: acute rheumatic fever due to group a streptococcal pharyngitis mediated by type ii hypersensitivity\n", - " Reference answer: acute rheumatic fever\n", - " Score: 0.85\n", + "[Sample 12]\n", + " Model prediction: contrast enhanced computed tomography of the abdomen\n", + " Reference answer: ct abdomen\n", + " Score: 0.812\n", "\n", - "[Sample 8]\n", - " Model prediction: exposure (open) method of burn treatment\n", - " Reference answer: heterograft application with sutures to secure it in place and daily washes, but no dressing\n", - " Score: 0.3031007751937985\n", + "[Sample 13]\n", + " Model prediction: unfractionated heparin\n", + " Reference answer: enoxaparin\n", + " Score: 0.428\n", "\n", - "[Sample 9]\n", - " Model prediction: beta-lactamase production leading to enzymatic inactivation of ampicillin\n", - " Reference answer: production of beta-lactamase enzyme\n", - " Score: 0.7555555555555555\n" + "[Sample 15]\n", + " Model prediction: t cell–mediated delayed (type iv) hypersensitivity\n", + " Reference answer: th1-mediated cytotoxicity\n", + " Score: 0.932\n" ] } ], @@ -1900,26 +1970,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We see about a 5-point boost in accuracy after fine-tuning. Looking at the first few errors, the model tends to harshly penalize answers that are close but not clinically identical-like *carotid duplex ultrasound* vs. *carotid doppler*. 
It also dings longer answers, even when they’re correct, like *beta-lactamase production leading to enzymatic inactivation of ampicillin*." + "We see about a 5-point boost in accuracy after fine-tuning. Looking at the first few errors, the model tends to harshly penalize answers that are close but not clinically identical, such as *unfractionated heparin* vs. *enoxaparin*. It also dings longer answers, even when they’re correct, like *contrast enhanced computed tomography of the abdomen*." ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 82, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "o4-mini-medium-simple-prompt bin counts: [ 4. 15. 9. 7. 7. 4. 3. 5. 22. 24.]\n", - "ftmodel-medium-simple-prompt bin counts: [ 8. 15. 7. 3. 9. 7. 8. 4. 19. 20.]\n", - "Max bin count (y-axis): 24.0\n" + "o4-mini-medium-simple-prompt bin counts: [ 2. 20. 13. 5. 60.]\n", + "ftmodel-medium-simple-prompt bin counts: [ 3. 12. 9. 6. 70.]\n", + "Max bin count (y-axis): 70.0\n" ] }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -1934,7 +2004,7 @@ "\n", "# Determine common bins for both histograms\n", "all_scores = scores_o4 + scores_ft\n", - "bins = plt.hist(all_scores, bins=10, alpha=0)[1]\n", + "bins = plt.hist(all_scores, bins=5, alpha=0)[1]\n", "\n", "# Plot histograms and capture the counts\n", "counts_o4, _, _ = plt.hist(\n", @@ -1953,7 +2023,7 @@ "plt.title(\"Model Grader 2 Score Distribution by Model\")\n", "plt.xlabel(\"Score\")\n", "plt.ylabel(\"Count\")\n", - "plt.ylim(top=25)\n", + "plt.ylim(top=75)\n", "plt.legend()\n", "\n", "# Print the bin counts\n", @@ -1966,7 +2036,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Looking at the distruibution of scores, we observe that RFT helped shift the model’s predictions out of the mid-to-low score zone (0.4–0.5) and into the mid-to-high range (0.5–0.6). Since the grader emphasizes clinical similarity over lexical match, this shift reflects stronger medical reasoning-not just better phrasing-according to our *expert* grader. As observed in the 0.9-1.0 range, some verbosity crept in despite mitigations and slightly lowering scores throughout, though it often reflected more complete, semantically aligned answers. A future grader pass could better account for these cases.\n", + "Looking at the distruibution of scores, we observe that RFT helped shift the model’s predictions out of the mid-to-low score zone (0.2-0.6) and into the high range (0.8-1.0). Since the grader emphasizes clinical similarity over lexical match, this shift reflects stronger medical reasoning-not just better phrasing-according to our *expert* grader. As seen in the (0.0-0.1) range, a handful of already weak predictions fell even further, hinting at a residual knowledge gap.\n", "\n", "Note that, because the earlier `combined_grader` was designed to reward lexical correctness, its accuracy didnʼt improve much-which is expected. That gap reinforces why validating your model grader is critical, and why you should monitor for reward-hacking. 
In our case, we used `o3` to spot-check grading behavior, but domain expert review is essential. " ] @@ -1982,16 +2052,16 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Mean reasoning_tokens_used o4-mini: 424\n", - "Mean reasoning_tokens_used o3: 353\n", - "Mean reasoning_tokens_used ftmodel: 1820\n" + "Mean reasoning_tokens_used o4-mini: 404\n", + "Mean reasoning_tokens_used o3: 384\n", + "Mean reasoning_tokens_used ftmodel: 925\n" ] } ], @@ -2019,46 +2089,45 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Classifying staging type\n", + "**Choosing imaging study**\n", "\n", - "The user provided a clinical scenario of a 35-year-old female with a 5 cm oral tumor and a 2 cm lymph node. They're asking how to stage it according to the TNM classification. This is a diagnosis query, so the correct answer type here is \"diagnosis.\" Considering the tumor's size, it appears to be classified as T3 since it's greater than 4 cm. Thus, I think the staging might be Stage II, but I'll confirm that.\n" + "The user is looking for a single phrase regarding the imaging study for a 49-year-old male with chronic alcohol consumption and related symptoms. I'm considering whether to suggest a CT scan or MRI; however, a CT scan is often the initial choice for chronic pancreatitis. I’ll go with \"abdominal ct scan\" since it's standardized. I need to ensure I format it in lowercase without punctuation, following the user’s request. 
So the output is \"abdominal ct scan.\"\n" ] } ], "source": [ "from IPython.display import Markdown, display\n", - "markdown_text = results_o4mini_model_grader_2[5][\"summaries\"]\n", + "markdown_text = results_o4mini_model_grader_2[0][30][\"summaries\"]\n", "display(Markdown(markdown_text))" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Clarifying T staging for cancers\n", - "\n", - "I’m digging into T staging for head and neck cancers in the oral cavity. So, T1 applies to tumors 2 cm or less, T2 for those over 2 cm but not more than 4 cm, and T3 is for tumors over 4 cm. T4a indicates invasion into adjacent structures. The patient's tumor measures 5 cm, which is over 4 cm. I’m not sure if it fits T3 or T4a, since T4a involves additional invasiveness, not just size. Determining T and N staging\n", + "**Considering imaging options**\n", "\n", - "I’m looking at a 5 cm tumor in the oral cavity. It seems there’s no mention of invasion into adjacent structures, so I’m categorizing it as T3 due to its size. T4a usually means invasion into structures like bone or skin. According to the TNM classification, since I see no such invasion, T classification remains T3.\n", + "I'm analyzing the user's question about a 49-year-old male with symptoms suggesting steatorrhea, possibly indicating exocrine pancreatic insufficiency from chronic alcohol use. It raises concerns about chronic pancreatitis or pancreatic cancer. I think the best imaging choice is a contrast-enhanced CT scan of the abdomen because it effectively examines structural abnormalities. Alternatively, an endoscopic ultrasound could be more sensitive, but CT is generally preferred. 
So, my recommendation is to start with a contrast-enhanced CT scan.\n", + "**Determining the appropriate imaging study**\n", "\n", - "Moving on to N staging, I see there's a single lymph node of 2 cm on the same side; this fits the N1 classification for metastasis, as it’s less than 3 cm.\n" + "I'm analyzing the question about the most suitable imaging study for a patient with symptoms suggesting chronic pancreatitis. The standard approach for suspected chronic pancreatitis is a contrast-enhanced CT scan of the abdomen, as it effectively identifies pancreatic calcifications and structural changes. While MRCP and endoscopic ultrasound provide additional details, CT is often preferred as the initial test. Therefore, my answer should focus on recommending a \"contrast-enhanced abdominal CT\" as the next step in evaluation.\n" ] } ], "source": [ - "markdown_text = results_ft_model_grader_2[5][\"summaries\"]\n", + "markdown_text = results_ft_model_grader_2[0][30][\"summaries\"]\n", "display(Markdown(markdown_text))" ] }, @@ -2066,7 +2135,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Base `o4-mini`'s reasoning gives a quick answer but doesn’t explain how it got there. It mentions the tumor size but doesn’t walk through the actual TNM rules, and it seems unsure about the result. On the other hand, the `finetuned model` is more thoughtful - breaking down the T and N staging step by step and explaining why each part applies. The latter seems more careful, and seems to have learnt to break down the case description even more." + "Base `o4‑mini`’s reasoning zooms straight to “abdominal CT scan,” mostly worrying about lowercase formatting and giving only a cursory “often the initial choice” justification. The `finetuned model`, meanwhile, first links the patient’s steatorrhea and alcohol history to chronic pancreatitis or cancer, weighs CT against MRCP and EUS, and explains why a contrast‑enhanced abdominal CT best reveals calcifications and structural change. 
The latter seems more careful, and seems to have learnt to break down the case description even more." ] }, { diff --git a/images/rft_dashboard_modelgrader2.png b/images/rft_dashboard_modelgrader2.png index 731c38c9af..ad9213d64d 100644 Binary files a/images/rft_dashboard_modelgrader2.png and b/images/rft_dashboard_modelgrader2.png differ