Skip to content

Commit 1c20860

Browse files
committed
Add ToC LLM example
1 parent 9c4c6a9 commit 1c20860

File tree

4 files changed

+232
-0
lines changed

4 files changed

+232
-0
lines changed

samples/resources/pdfs/nist.pdf

346 KB
Binary file not shown.

samples/sandbox/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
11
# Sample results
22
*.pdf
33
*.txt
4+
5+
# Requirements for some of the samples
6+
!requirements.txt

samples/sandbox/ai/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
openai~=1.75.0
Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
#
2+
# This sample shows an example of generating table of contents for existing
3+
# PDF documents with the help of an LLM with an OpenAI-compatible API.
4+
#
5+
# Before running the sample, make sure, that the `openai` package is installed
6+
# and that the model you are running has a big enough context window to fit the
7+
# whole input document.
8+
#
9+
# Default settings are set to use a Qwen 2.5 model on a local Ollama instance.
10+
# The default Ollama context window is small, so make sure to change it using
11+
# the instructions below.
12+
#
13+
import itextpy
14+
itextpy.load()
15+
16+
from itextpy.util import disposing
17+
18+
from pathlib import Path
19+
20+
from openai import OpenAI
21+
22+
from iText.Kernel.Geom import PageSize
23+
from iText.Kernel.Pdf.Canvas.Parser import PdfTextExtractor
24+
from iText.Kernel.Pdf.Canvas.Parser.Listener import LocationTextExtractionStrategy
25+
from iText.Kernel.Pdf import PdfOutline, PdfReader, PdfWriter, PdfDocument
26+
from iText.Kernel.Pdf.Action import PdfAction
27+
from iText.Kernel.Pdf.Canvas.Draw import DottedLine
28+
from iText.Kernel.Pdf.Navigation import PdfExplicitDestination
29+
from iText.Layout import Canvas
30+
from iText.Layout.Element import List, ListItem, Paragraph, Tab, TabStop
31+
from iText.Layout.Properties import ListNumberingType, TabAlignment
32+
33+
# Absolute path of the directory that contains this script.
SCRIPT_DIR = Path(__file__).parent.absolute()
# The shared "resources" directory sits two levels above the script directory.
RESOURCES_DIR = SCRIPT_DIR.joinpath("..", "..", "resources")
# Input document for the sample, kept as a plain string.
INPUT_PDF_PATH = str(RESOURCES_DIR.joinpath("pdfs", "nist.pdf"))
36+
37+
#
38+
# This block defines connection to the LLM you want to use. Default values
39+
# will make the script connect to a local Ollama instance with a Qwen 2.5
40+
# model available to it.
41+
#
42+
# If you are going to use Ollama, make sure that you increase the context
43+
# window size, as the default value is very small. With the default value
44+
# there is a high chance, that the document won't fit into the context window,
45+
# which will result in an incomplete or broken table.
46+
#
47+
# Easiest way to increase the context window is to set env var
48+
# OLLAMA_CONTEXT_LENGTH to a big enough value (like 65536) before starting
49+
# Ollama. Also make sure, that Ollama is version 0.5.13 or above. See this page
50+
# for more info:
51+
# https://github.com/ollama/ollama/blob/main/docs/faq.md#how-can-i-specify-the-context-window-size
52+
#
53+
OPENAI_BASE_URL = "http://localhost:11434/v1"  # Base URL handed to the OpenAI client; this is the default local Ollama URL
54+
OPENAI_API_KEY = "EMPTY"  # API key handed to the OpenAI client; no API key needed for Ollama
55+
OPENAI_MODEL = "qwen2.5"  # Model name sent with the chat completion request
56+
57+
58+
# This is a tree-like structure for storing table of contents data. Root is a
59+
# node without a caption.
60+
class TableEntry:
61+
def __init__(self, caption: str | None = None):
62+
self.caption = caption
63+
self.children = []
64+
self.page_idx = None
65+
66+
def __str__(self):
67+
return self.caption
68+
69+
def __iter__(self):
70+
if self.caption is not None:
71+
yield self
72+
for child in self.children:
73+
for node in child:
74+
yield node
75+
76+
77+
# This function converts pages of the PDF document to text.
78+
def get_pages_as_text(doc: PdfDocument) -> list[str]:
    """Extract the text of every page of the document.

    Args:
        doc: Document to read the pages from.

    Returns:
        One string per page, in page order.
    """
    page_texts = []
    # iText page numbers are one-based.
    for page_num in range(1, doc.GetNumberOfPages() + 1):
        # This is the default strategy, which works relatively fine. But you
        # might get a better result with a custom one, which tries to preserve
        # spatial data with whitespace.
        #
        # A fresh strategy is created for every page: a location-based
        # strategy keeps the text chunks it has already received, so sharing
        # one instance would make the result for a page also contain the text
        # of the pages processed before it.
        strategy = LocationTextExtractionStrategy()
        # GetTextFromPage is a static method, so it is called on the class.
        page_texts.append(PdfTextExtractor.GetTextFromPage(doc.GetPage(page_num), strategy))
    return page_texts
86+
87+
88+
# This function parses the ToC LLM response into a tree-like structure.
89+
def parse_response(response_lines: list[str]) -> TableEntry:
    """Parse the ToC lines returned by the LLM into a tree of entries.

    Every non-blank line is expected to look like "1.2.3 Chapter": a
    dot-separated, one-based index followed by the caption. Entries have to
    come in order: a parent is listed before its children and siblings are
    numbered consecutively, starting from 1. Blank lines are skipped.

    Args:
        response_lines: Lines of the LLM response.

    Returns:
        The caption-less root entry holding the parsed tree.

    Raises:
        ValueError: If a line has no numeric index or no caption.
        Exception: If an index does not continue the tree built so far.
    """
    result = TableEntry()
    for raw_line in response_lines:
        line = raw_line.strip()
        if not line:
            # Blank lines carry no entry; skipping them avoids failing on a
            # response that is merely padded with empty lines.
            continue
        index_str, _, caption = line.partition(" ")
        caption = caption.strip()
        try:
            # Empty parts are dropped, so a trailing dot ("1.2.") is accepted.
            index_seq = [int(i) for i in index_str.split(".") if i]
        except ValueError:
            raise ValueError(f"Unexpected table entry: {line!r}") from None
        if not index_seq or not caption:
            raise ValueError(f"Unexpected table entry: {line!r}")
        # Walk down to the list of siblings the new entry belongs to.
        entry_list = result.children
        for index in index_seq[:-1]:
            # The parent must already exist; an index below 1 would otherwise
            # silently wrap around to the last sibling.
            if not 1 <= index <= len(entry_list):
                raise Exception("Unexpected index value")
            entry_list = entry_list[index - 1].children
        pos = index_seq[-1] - 1
        # The new entry must be the next sibling in sequence.
        if pos != len(entry_list):
            raise Exception("Unexpected index value")
        entry_list.append(TableEntry(caption))
    return result
103+
104+
105+
# This function augments the table of contents with page numbers for each
106+
# entry.
107+
#
108+
# Since LLM is not that reliable with giving a precise location for entries,
109+
# we will assume, that we can find them in the original text and recover the
110+
# page data that way. This is not ideal, but works relatively well in practice.
111+
def add_page_data(toc_data: TableEntry, pages: list[str]) -> None:
    """Assign a zero-based page index to every entry of the table.

    Captions are looked up case-insensitively in the page texts. The search
    only moves forward: each caption is searched for starting right after the
    previous match. An entry whose caption cannot be found gets the page of
    the previous match (page 0 if nothing has been matched yet).

    Args:
        toc_data: Iterable tree of entries; each yielded entry has its
            `page_idx` attribute set in place.
        pages: Text of the document pages, in page order.
    """
    casefold_pages = [p.casefold() for p in pages]
    prev_page_idx = 0
    prev_str_idx = 0
    for node in toc_data:
        # The caption does not change while scanning pages, so fold it once.
        casefold_caption = node.caption.casefold()
        # Only the page of the previous match is resumed mid-text; every
        # following page is searched from its beginning.
        search_start = prev_str_idx
        for page_idx in range(prev_page_idx, len(casefold_pages)):
            str_idx = casefold_pages[page_idx].find(casefold_caption, search_start)
            if str_idx != -1:
                prev_page_idx = page_idx
                prev_str_idx = str_idx + len(casefold_caption)
                break
            search_start = 0
        node.page_idx = prev_page_idx
124+
125+
126+
# This function asks the LLM to generate the table of contents for the provided
127+
# pages of text. The result gets parsed into a tree-like structure.
128+
def generate_toc_data(pages: list[str]) -> TableEntry:
    """Ask the LLM for a table of contents and parse it into a tree.

    The page texts are joined and sent to the model configured via
    OPENAI_BASE_URL, OPENAI_API_KEY and OPENAI_MODEL. The response is parsed
    with parse_response() and the entries get their page indices from
    add_page_data().

    Args:
        pages: Text of the document pages, in page order.

    Returns:
        Root entry of the parsed table of contents. It has no children if
        the model returned no text.
    """
    openai_client = OpenAI(
        base_url=OPENAI_BASE_URL,
        api_key=OPENAI_API_KEY,
    )
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "You are about to read text of a PDF document:"},
        {"role": "user", "content": "\n\n".join(pages)},
        {"role": "user", "content": "Generate a numbered table of contents for the document. "
                                    "Write only the table entries."},
    ]
    response = openai_client.chat.completions.create(
        model=OPENAI_MODEL,
        messages=messages,
        temperature=0.1,
    )
    # The message content is optional in the API response; treat a missing
    # content as an empty answer, so that the result is an empty table
    # instead of an AttributeError on None.
    response_content = response.choices[0].message.content or ""
    toc_data = parse_response(response_content.splitlines())
    add_page_data(toc_data, pages)
    return toc_data
149+
150+
151+
# This function just recursively generates the Bookmarks tree for the
152+
# resulting PDF document.
153+
def fill_outline(doc: PdfDocument, outline_root: PdfOutline, toc_children: list[TableEntry]) -> None:
    """Mirror the given ToC entries as bookmarks under `outline_root`.

    Each entry becomes one outline item pointing at the entry's page; the
    entry's children are added recursively below that item.

    Args:
        doc: Document the bookmarks point into.
        outline_root: Outline item to attach the new bookmarks to.
        toc_children: Entries to add at this level.
    """
    for toc_entry in toc_children:
        bookmark = outline_root.AddOutline(toc_entry.caption)
        # page_idx is zero-based, hence the first +1; the second +1 accounts
        # for the ToC page that was added at the start of the document.
        target_page = doc.GetPage(toc_entry.page_idx + 1 + 1)
        bookmark.AddDestination(PdfExplicitDestination.CreateFit(target_page))
        # Recursing into an empty list of children adds nothing.
        fill_outline(doc, bookmark, toc_entry.children)
161+
162+
163+
# This function recursively generates a table of contents, using numbered list
164+
# from the iText layout engine.
165+
def generate_list(doc: PdfDocument, tab_stops: list[TabStop], toc_children: list[TableEntry]) -> List:
    """Build a numbered iText list for one level of the table of contents.

    Every entry becomes a list item with a clickable paragraph made of the
    caption, a tab and the page number. Children of an entry are rendered as
    a nested list inside the same item.

    Args:
        doc: Document the links point into.
        tab_stops: Tab stops applied to every entry paragraph.
        toc_children: Entries to render at this level.

    Returns:
        The list element holding all entries of this level.
    """
    toc_list = List(ListNumberingType.DECIMAL)
    for toc_entry in toc_children:
        shown_page_num = toc_entry.page_idx + 1
        # +1 since we added the ToC page at the start
        link_target = PdfExplicitDestination.CreateFit(doc.GetPage(shown_page_num + 1))

        paragraph = Paragraph().SetMargin(2).SetFontSize(12).AddTabStops(tab_stops)
        paragraph = paragraph.Add(toc_entry.caption).Add(Tab()).Add(str(shown_page_num))
        paragraph = paragraph.SetAction(PdfAction.CreateGoTo(link_target))

        list_item = ListItem()
        list_item.Add(paragraph)
        if toc_entry.children:
            nested_list = generate_list(doc, tab_stops, toc_entry.children)
            list_item.Add(nested_list)
        toc_list.Add(list_item)
    return toc_list
185+
186+
187+
# This function creates a page with the table of contents and prepends it to
188+
# the PDF document. If there are no bookmarks present, then they will be added
189+
# too.
190+
#
191+
# This function at the moment assumes, that everything will fit into one page.
192+
def add_toc_to_doc(doc: PdfDocument, toc_data: TableEntry) -> None:
    """Prepend a table-of-contents page to the document.

    The new page gets the size of the current first page. Bookmarks are
    generated from the same data, but only when the document has none yet.

    Args:
        doc: Document to modify.
        toc_data: Root entry of the table of contents.
    """
    first_page_size = PageSize(doc.GetPage(1).GetPageSize())
    toc_page = doc.AddNewPage(1, first_page_size)
    content_box = toc_page.GetCropBox().ApplyMargins(36, 36, 36, 36, False)
    with disposing(Canvas(toc_page, content_box)) as toc_canvas:
        toc_canvas.Add(Paragraph("Table of Contents").SetFontSize(24))
        # Right-aligned tab stop at the full content width, with a dotted
        # leader between the caption and the page number.
        dotted_tab_stop = TabStop(content_box.GetWidth(), TabAlignment.RIGHT, DottedLine())
        toc_canvas.Add(generate_list(doc, [dotted_tab_stop], toc_data.children))

    # Bookmarks are only generated when there are none; existing ones are
    # left untouched.
    existing_outlines = doc.GetOutlines(True)
    if len(existing_outlines.GetAllChildren()) != 0:
        return
    fill_outline(doc, doc.GetOutlines(True), toc_data.children)
209+
210+
211+
# The algorithm is pretty straightforward:
212+
# 1. Convert PDF document pages to text.
213+
# 2. Send pages to an LLM and ask it to generate the table of contents.
214+
# 3. Parse the LLM response.
215+
# 4. Generate and add the table of contents page, together with bookmarks.
216+
def main(in_path: str, out_path: str) -> None:
    """Generate a table of contents for a PDF and write the result.

    Args:
        in_path: Path of the PDF document to read.
        out_path: Path the resulting PDF document is written to.

    Raises:
        Exception: If the document has no pages or the generated table of
            contents has no entries.
    """
    pdf = PdfDocument(PdfReader(in_path), PdfWriter(out_path))
    with disposing(pdf) as doc:
        page_texts = get_pages_as_text(doc)
        if not page_texts:
            raise Exception("Document is empty!")

        toc_root = generate_toc_data(page_texts)
        if not toc_root.children:
            raise Exception("Table of Contents is empty!")

        add_toc_to_doc(doc, toc_root)
225+
226+
227+
# Call the function to create a PDF
228+
# Script entry point. The call is unguarded, so it runs whenever this module is
# executed. The output file name is relative; presumably it is resolved against
# the current working directory (TODO confirm).
main(INPUT_PDF_PATH, "table_of_contents.pdf")

0 commit comments

Comments
 (0)