Skip to content

Commit 4d49017

Browse files
authored
Merge pull request #1762 from oracle-devrel/ao-langchain-CVAnalysis
New Asset (Multi-modal document extraction using Llama 4 models)
2 parents 26e8a92 + ac7e0ae commit 4d49017

File tree

6 files changed

+355
-0
lines changed

6 files changed

+355
-0
lines changed
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Multi-modal Document Extraction
2+
3+
*This Generative AI service application uses the OCI SDK together with the new Llama 4 models (Scout and Maverick) to extract data from PDFs (or images) into structured JSON.*
4+
5+
Reviewed: 20.05.2025
6+
7+
# When to use this asset?
8+
9+
Use this asset if you are a developer, data scientist, or ML engineer who needs to extract structured JSON from invoices or other document images and wants to compare the performance of the new Llama 4 vision models on OCI.
10+
11+
# How to use this asset?
12+
13+
1. Open the Streamlit app
14+
2. Upload a PDF or image file
15+
3. In the sidebar, select either **meta.llama-4-scout-17b-16e-instruct** or **meta.llama-4-maverick-17b-128e-instruct-fp8**
16+
4. Wait for processing—JSON output will be displayed when finished
17+
18+
# Setup
19+
20+
To get started, clone the repository, install dependencies, and launch the app:
21+
22+
```bash
23+
git clone <repository-url>
24+
cd <repository-folder>
25+
pip install -r requirements.txt
26+
streamlit run <file_name>.py
27+
```
28+
29+
# Useful Links (Optional)
30+
31+
* [More information on Llama 4](https://confluence.oraclecorp.com/confluence/display/EMEACSS/FAQ+for+Generative+AI+Service)
32+
33+
* [Pretrained Foundational Models in Generative AI](https://docs.oracle.com/en-us/iaas/Content/generative-ai/pretrained-models.htm)
34+
35+
# License
36+
37+
Copyright (c) 2025 Oracle and/or its affiliates.
38+
39+
Licensed under the Universal Permissive License (UPL), Version 1.0.
40+
41+
See [LICENSE](https://github.com/oracle-devrel/technology-engineering/blob/main/LICENSE) for more details.
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
"""
Configuration for the multi-modal document extraction app.

The compartment OCID and the service endpoint are tenancy/region specific.
Each can be overridden with an environment variable; when the variable is
not set, the value that was previously hard-coded here is used.
"""

import os

# OCID of the compartment that Generative AI chat requests are issued against.
compartment_id = os.environ.get(
    "OCI_COMPARTMENT_ID",
    "ocid1.compartment.oc1..aaaaaaaaoi33ny4fvy2nxlrbkn5l2t6sw6yuy5tats7iipnb5hz6jmylqqnq",
)

# Regional endpoint of the OCI Generative AI inference service.
service_endpoint = os.environ.get(
    "OCI_GENAI_SERVICE_ENDPOINT",
    "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
)

# Model IDs offered in the app's model selector.
vision_models = [
    "meta.llama-4-scout-17b-16e-instruct",
    "meta.llama-4-maverick-17b-128e-instruct-fp8",
]
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
"""
2+
Simple streamlit UI for comparison of vision models
3+
4+
Converts PDF to image -> Extracts all data into a JSON
5+
6+
Available models:
7+
- Llama 4 Scout
8+
- Llama 4 Maverick
9+
10+
Author - Ali Ottoman
11+
"""
12+
13+
import io
14+
import base64
15+
import oci
16+
from pdf2image import convert_from_bytes
17+
import streamlit as st
18+
from oci_models import get_llm
19+
from prompt import OVERALL_PROMPT
20+
from config import compartment_id, vision_models
21+
22+
23+
# ─── LLM Creation ─────────────────────────────────────────────────────────────
24+
# Module-level inference client; call_vision_model() sends its chat requests through it.
llm_client = get_llm()
25+
26+
# ─── Helper Functions ─────────────────────────────────────────────────────────────
27+
def save_images(images, output_format="JPEG"):
    """
    Serialize each image into its own in-memory buffer.

    Nothing is written to disk: every image is encoded with ``image.save``
    into a fresh ``io.BytesIO`` that is rewound to position 0 before being
    returned.

    Args:
        images: Iterable of objects exposing ``save(fp, format=...)``.
        output_format: Format name forwarded to ``save``.

    Returns:
        A list of ``io.BytesIO`` buffers, one per input image, in input order.
    """

    def _to_buffer(page):
        # Encode a single image and rewind so callers can read from the start.
        buffer = io.BytesIO()
        page.save(buffer, format=output_format)
        buffer.seek(0)
        return buffer

    return [_to_buffer(page) for page in images]
38+
39+
def encode_image(image_path):
    """
    Read a file from disk and return its contents as base64 text.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The file's bytes, base64-encoded and decoded to a UTF-8 ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
45+
46+
def make_user_message(prompt: str, b64img: str):
    """
    Build an OCI ``UserMessage`` that carries the prompt text plus one inline image.

    Args:
        prompt: Instruction text, sent as the text part of the message.
        b64img: Base64-encoded image data; it is embedded in a
            ``data:image/jpeg;base64,...`` URL.

    Returns:
        A ``UserMessage`` whose ``content`` is ``[text part, image part]``.
    """
    models = oci.generative_ai_inference.models

    # Text part
    text_part = models.TextContent(text=prompt)

    # Image part: the image travels inline as a data URL rather than a remote link.
    image_part = models.ImageContent(
        image_url=models.ImageUrl(url=f"data:image/jpeg;base64,{b64img}")
    )

    return models.UserMessage(content=[text_part, image_part])
63+
64+
def call_vision_model(frame, prompt: str, vision_model: str):
    """
    Send one prompt + image to the chosen vision model and return its text reply.

    Args:
        frame: Base64-encoded image, forwarded to ``make_user_message``.
        prompt: Instruction text for the model.
        vision_model: Model ID used for on-demand serving.

    Returns:
        The ``text`` of the first content part of the first choice in the response.
    """
    models = oci.generative_ai_inference.models

    # Generic-format chat request holding the single user message.
    request = models.GenericChatRequest(
        messages=[make_user_message(prompt, frame)],
        api_format=models.BaseChatRequest.API_FORMAT_GENERIC,
        num_generations=1,
        is_stream=False,
        temperature=0.5,
        top_p=0.7,
        top_k=-1,
        frequency_penalty=1.0,
    )

    chat_details = models.ChatDetails(
        serving_mode=models.OnDemandServingMode(model_id=vision_model),
        compartment_id=compartment_id,
        chat_request=request,
    )

    # Invoke the model through the module-level client.
    response = llm_client.chat(chat_details)
    first_choice = response.data.chat_response.choices[0]
    return first_choice.message.content[0].text
91+
92+
# ─── Main Function ─────────────────────────────────────────────────────────────
93+
def main():
    """
    Render the Streamlit UI, run the selected vision model on the upload, and show the result.

    Flow:
        1. The user uploads a file and picks a model in the sidebar.
        2. A PDF is rasterized to JPEG (first page only); an image upload is
           forwarded as-is.
        3. The image is base64-encoded, sent with ``OVERALL_PROMPT`` to
           ``call_vision_model``, and the model's reply is written to the page.

    Uploads that are neither PDFs nor images are rejected with an error message.
    """
    st.title("Model Comparison")

    uploaded_image = st.file_uploader("Upload image here")

    prompt = OVERALL_PROMPT

    with st.sidebar:
        st.subheader("Select your model for comparison")
        vision_model = st.selectbox("Choose your model:", vision_models)

    if uploaded_image is None:
        return

    with st.spinner("Processing..."):
        file_bytes = uploaded_image.read()
        mime_type = uploaded_image.type or ""

        if mime_type == "application/pdf":
            # Only the first page is sent to the model, so only that page is rasterized.
            pages = convert_from_bytes(file_bytes, fmt="jpeg", first_page=1, last_page=1)
            if not pages:
                st.error("The uploaded PDF does not contain any pages.")
                return
            frame_bytes = save_images(pages)[0].getvalue()
        elif mime_type.startswith("image/"):
            # pdf2image can only read PDFs, so an image upload must not be routed
            # through convert_from_bytes; its bytes are already an encoded image.
            # NOTE(review): call_vision_model labels the payload as image/jpeg.
            # Confirm the service accepts non-JPEG images under that label, or
            # transcode them to JPEG first.
            frame_bytes = file_bytes
        else:
            st.error("Unsupported file type. Please upload a PDF or an image.")
            return

        encoded_frame = base64.b64encode(frame_bytes).decode("utf-8")

        result = call_vision_model(encoded_frame, prompt, vision_model)
        st.write(result)
# ────────────────────────────────────────────────────────────────
121+
if __name__ == "__main__":
122+
main()
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
"""
2+
This module provides a function to initialize LLM
3+
4+
Return an instance of the OCI GenAI language model.
5+
6+
Author: Ali Ottoman
7+
"""
8+
# ─── Imports ────────────────────────────────────────────────────────────────────
9+
import oci
10+
from config import service_endpoint
11+
12+
13+
# ─── Configuration ─────────────────────────────────────────────────────────────
14+
# Loaded when this module is imported: the "DEFAULT" profile of ~/.oci/config.
# get_llm() passes this dict to the client it creates.
config = oci.config.from_file("~/.oci/config", "DEFAULT")
15+
16+
def get_llm():
    """
    Create a client for the OCI Generative AI inference service.

    The client uses the module-level ``config`` and the ``service_endpoint``
    imported from the config module. Retries are disabled
    (``NoneRetryStrategy``) and the timeout is set to ``(10, 240)``.

    Returns:
        oci.generative_ai_inference.GenerativeAiInferenceClient: the configured client.
    """
    client_options = {
        "service_endpoint": service_endpoint,
        "retry_strategy": oci.retry.NoneRetryStrategy(),
        "timeout": (10, 240),
    }
    return oci.generative_ai_inference.GenerativeAiInferenceClient(
        config=config,
        **client_options,
    )
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
"""
2+
This file contains various prompt templates for different invoice parsing tasks.
3+
"""
4+
5+
# Generic invoice-parsing prompt. The example at the end must itself be valid
# JSON, because the model is told to emit nothing but valid JSON and tends to
# imitate the example.
OVERALL_PROMPT = """
You are a high-precision invoice parser.
When given an image of an invoice, you will:

1. Detect all section headers on the invoice.
   - A header is any line in larger or bold font, or followed by a blank line, colon, or underline.

2. Extract the content under each header until the next header or end of document.
   - Key–Value blocks: single lines or small blocks → JSON properties.
   - Tables: first row as column headers (snake_case) → array of objects.
   - Multi-line notes: join lines with spaces.

3. For monetary fields, strip symbols/codes and output two properties:
   - <field_name> (number)
   - <field_name>_currency (string)

4. General rules:
   - DO NOT output anything other than the valid JSON—no markdown, NO extra text.
   - Use null for missing values.
   - Dates must be ISO 8601 (YYYY-MM-DD).

Example (a table array contains one object per row; add further objects for additional rows if present):
{
  "company_info": {
    "name": "Oman Insurance Management Services Ltd.",
    "address": "Unit 407, Level 4, Gate District 03, DIFC, Dubai, United Arab Emirates",
    "reference": "KFM97956124-E6",
    "date": "2024-11-29"
  },
  "attention_to": null,
  "credit_note": "Endorsement #6 HANMIR",
  "reinsured": {
    "name": "Hanwha General Insurance Co., Ltd. (Korean Reinsurance Company)"
  },
  "original_insured": "KOREA INSTITUTE OF MARITIME AND FISHERIES TECHNOLOGY (OWNER & MANAGER)",
  "insurance_covers": "Hull Facultative Reinsurance",
  "policy_no": null,
  "insurance_period": "One year as from 2024-04-01",
  "Line Items": [
    {
      "description": "Premium",
      "amount": 12345.67,
      "amount_currency": "KRW"
    }
  ],
  "Order Hereon": {
    "percentage": "7.5%",
    "amount": 131797,
    "amount_currency": "KRW"
  }
}
"""
59+
60+
# Fixed-field extraction prompt for debit/credit notes. The example is a
# complete JSON object so that it matches the "valid JSON only" instruction.
GENERIC_PROMPT = """
Extract the following details and provide the response only in valid JSON format (no extra explanation or text):
- **Debit / Credit Note No.**
- **Policy Period**
- **Insured**
- **Vessel Name** (make sure this is the entire vessel name only)
- **Details**
- **Currency**
- **Gross Premium 100%**
- **OIMSL Share**
- **Total Deductions**
- **Net Premium**
- **Premium Schedule**
- **Installment Amount**

Ensure the extracted data is formatted correctly as JSON and include nothing else at all in the response, not even a greeting or closing.

For example:

{
  "Debit / Credit Note No.": "296969",
  "Policy Period": "Feb 20, 2024 to Jul 15, 2025",
  "Insured": "Stealth Maritime Corp. S.A.",
  "Vessel Name": "SUPRA DUKE - HULL & MACHINERY",
  "Details": "SUPRA DUKE - Original Premium",
  "Currency": "USD",
  "Gross Premium 100%": 56973.63,
  "OIMSL Share": 4557.89,
  "Total Deductions": 979.92,
  "Net Premium": 3577.97,
  "Premium Schedule": ["Apr 20, 2024", "Jun 14, 2024", "Sep 13, 2024", "Dec 14, 2024", "Mar 16, 2025", "Jun 14, 2025"],
  "Installment Amount": [372.87, 641.02, 641.02, 641.02, 641.02, 641.02]
}
"""
94+
95+
# Fixed-schema invoice prompt. The example is given as bare JSON (no code
# fence) so that it does not contradict the "no markdown" rule above it.
NHS_PROMPT = """
You are a high-precision invoice parser.
When given an invoice (image, PDF, or text), produce **one** valid JSON object with exactly the following fields, in this order:

1. invoice_number (string)
2. account_reference (string)
3. issue_date (ISO 8601 date: YYYY-MM-DD)
4. due_date (ISO 8601 date: YYYY-MM-DD)
5. supplier_name (string)
6. supplier_address (string)
7. VAT_registration_number (string)
8. total_amount (number)
9. currency (string)
10. vat_amount (number)
11. line_items (array of objects), each with:
    - description (string)
    - quantity (string)
    - unit_price (number)
    - total (number)

**Rules:**
- **Output only** the JSON—no markdown, no extra text.
- Use `null` for any missing values.
- Dates **must** be in ISO 8601 (YYYY-MM-DD).
- Numeric fields must omit symbols and separators (e.g. `1500.0`, not “$1,500”).
- Preserve the array structure for `line_items` even if empty.

**Example:**
{
  "invoice_number": "INV-1001",
  "account_reference": "AR-2024",
  "issue_date": "2024-05-18",
  "due_date": "2024-06-18",
  "supplier_name": "Acme Corporation",
  "supplier_address": "123 Main St, Anytown, Country",
  "VAT_registration_number": "GB123456789",
  "total_amount": 1500.0,
  "currency": "GBP",
  "vat_amount": 300.0,
  "line_items": [
    {
      "description": "Widget A",
      "quantity": "10",
      "unit_price": 50.0,
      "total": 500.0
    },
    {
      "description": "Widget B",
      "quantity": "20",
      "unit_price": 50.0,
      "total": 1000.0
    }
  ]
}
"""
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
langchain_community==0.3.24
2+
langchain_core==0.3.59
3+
pdf2image==1.17.0
4+
streamlit==1.41.0
5+
oci==2.150.3

0 commit comments

Comments
 (0)