Skip to content

Commit 341d3fc

Browse files
authored
[feat] Add pdf loader (Layout-Parser#71)
* reorganize folders * automatically setting ids * Add pdf loading functionality * add pdf tests * add deps * Add load_pdf to init * add load_pdf to doc * Add copyright
1 parent c0044a0 commit 341d3fc

File tree

8 files changed

+252
-11
lines changed

8 files changed

+252
-11
lines changed

docs/api_doc/io.rst

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,28 @@ Load and Export Layout Data
22
================================
33

44

5-
DataFrame and CSV
5+
`DataFrame` and CSV
66
--------------------------------
77

88
.. autofunction:: layoutparser.io.load_dataframe
99

1010
.. autofunction:: layoutparser.io.load_csv
1111

1212

13-
Dictionary and JSON
13+
`Dict` and JSON
1414
--------------------------------
1515

1616
.. autofunction:: layoutparser.io.load_dict
1717

1818
.. autofunction:: layoutparser.io.load_json
1919

2020

21+
PDF
22+
--------------------------------
23+
24+
.. autofunction:: layoutparser.io.load_pdf
25+
26+
2127
Other Formats
2228
--------------------------------
2329
Stay tuned! We are working on supporting more formats.

setup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
"pillow",
4242
"pyyaml>=5.1",
4343
"iopath",
44+
"pdfplumber",
45+
"pdf2image",
4446
],
4547
extras_require={
4648
"ocr": [

src/layoutparser/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@
4141
"load_json",
4242
"load_dict",
4343
"load_csv",
44-
"load_dataframe"
44+
"load_dataframe",
45+
"load_pdf"
4546
],
4647
"file_utils":[
4748
"is_torch_available",

src/layoutparser/io/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from .basic import load_json, load_dict, load_csv, load_dataframe
2+
from .pdf import load_pdf

src/layoutparser/io.py renamed to src/layoutparser/io/basic.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,8 @@
1818

1919
import pandas as pd
2020

21-
from .elements import (
22-
BaseCoordElement,
21+
from ..elements import (
2322
BaseLayoutElement,
24-
Interval,
25-
Rectangle,
26-
Quadrilateral,
2723
TextBlock,
2824
Layout,
2925
BASECOORD_ELEMENT_NAMEMAP,
@@ -144,4 +140,7 @@ def load_dataframe(df: pd.DataFrame, block_type: str = None) -> Layout:
144140
else:
145141
df["block_type"] = block_type
146142

143+
if "id" not in df.columns:
144+
df["id"] = df.index
145+
147146
return load_dict(df.apply(lambda x: x.dropna().to_dict(), axis=1).to_list())

src/layoutparser/io/pdf.py

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
# Copyright 2021 The Layout Parser team. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from typing import List, Union, Optional, Dict, Tuple
16+
17+
import pdfplumber
18+
import pandas as pd
19+
20+
from ..elements import Layout
21+
from .basic import load_dataframe
22+
23+
DEFAULT_PDF_DPI = 72
24+
25+
26+
def extract_words_for_page(
    page: pdfplumber.page.Page,
    x_tolerance=1.5,
    y_tolerance=2,
    keep_blank_chars=False,
    use_text_flow=True,
    horizontal_ltr=True,
    vertical_ttb=True,
    extra_attrs=None,
) -> Layout:
    """Extract the word tokens of a single pdfplumber page as a Layout.

    Args:
        page (pdfplumber.page.Page): The page to extract words from.
        x_tolerance, y_tolerance, keep_blank_chars, use_text_flow,
        horizontal_ltr, vertical_ttb:
            Forwarded unchanged to ``page.extract_words``.
        extra_attrs (list of str, optional):
            Forwarded to ``page.extract_words``. When None, it defaults to
            ``["fontname", "size"]``.

    Returns:
        Layout: a layout object representing all extracted pdf tokens on this
        page. The layout is empty when no word is extracted from the page.
    """
    if extra_attrs is None:
        extra_attrs = ["fontname", "size"]

    tokens = page.extract_words(
        x_tolerance=x_tolerance,
        y_tolerance=y_tolerance,
        keep_blank_chars=keep_blank_chars,
        use_text_flow=use_text_flow,
        horizontal_ltr=horizontal_ltr,
        vertical_ttb=vertical_ttb,
        extra_attrs=extra_attrs,
    )

    # A page without any extractable word (e.g., a blank or image-only page)
    # yields no tokens. A DataFrame built from an empty list has no columns,
    # so the coordinate column selection below would raise a KeyError.
    if not tokens:
        return Layout([])

    df = pd.DataFrame(tokens)

    # Keep the token coordinates inside the page boundaries.
    df[["x0", "x1"]] = (
        df[["x0", "x1"]].clip(lower=0, upper=int(page.width)).astype("float")
    )
    df[["top", "bottom"]] = (
        df[["top", "bottom"]].clip(lower=0, upper=int(page.height)).astype("float")
    )

    page_tokens = load_dataframe(
        df.rename(
            columns={
                "x0": "x_1",
                "x1": "x_2",
                "top": "y_1",
                "bottom": "y_2",
                "fontname": "type",  # also loading fontname as "type"
            }
        ),
        block_type="rectangle",
    )

    return page_tokens
77+
78+
79+
def load_pdf(
80+
filename: str,
81+
load_images: bool = False,
82+
x_tolerance: int = 1.5,
83+
y_tolerance: int = 2,
84+
keep_blank_chars: bool = False,
85+
use_text_flow: bool = True,
86+
horizontal_ltr: bool = True,
87+
vertical_ttb: bool = True,
88+
extra_attrs: Optional[List[str]] = None,
89+
dpi: int = DEFAULT_PDF_DPI,
90+
) -> Union[List[Layout], Tuple[List[Layout], List["Image.Image"]]]:
91+
"""Load all tokens for each page from a PDF file, and save them
92+
in a list of Layout objects with the original page order.
93+
94+
Args:
95+
filename (str): The path to the PDF file.
96+
load_images (bool, optional):
97+
Whether load screenshot for each page of the PDF file.
98+
When set to true, the function will return both the layout and
99+
screenshot image for each page.
100+
Defaults to False.
101+
x_tolerance (int, optional):
102+
The threshold used for extracting "word tokens" from the pdf file.
103+
It will merge the pdf characters into a word token if the difference
104+
between the x_2 of one character and the x_1 of the next is less than
105+
or equal to x_tolerance. See details in `pdf2plumber's documentation
106+
<https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
107+
Defaults to 1.5.
108+
y_tolerance (int, optional):
109+
The threshold used for extracting "word tokens" from the pdf file.
110+
It will merge the pdf characters into a word token if the difference
111+
between the y_2 of one character and the y_1 of the next is less than
112+
or equal to y_tolerance. See details in `pdf2plumber's documentation
113+
<https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
114+
Defaults to 2.
115+
keep_blank_chars (bool, optional):
116+
When keep_blank_chars is set to True, it will treat blank characters
117+
are treated as part of a word, not as a space between words. See
118+
details in `pdf2plumber's documentation
119+
<https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
120+
Defaults to False.
121+
use_text_flow (bool, optional):
122+
When use_text_flow is set to True, it will use the PDF's underlying
123+
flow of characters as a guide for ordering and segmenting the words,
124+
rather than presorting the characters by x/y position. (This mimics
125+
how dragging a cursor highlights text in a PDF; as with that, the
126+
order does not always appear to be logical.) See details in
127+
`pdf2plumber's documentation
128+
<https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
129+
Defaults to True.
130+
horizontal_ltr (bool, optional):
131+
When horizontal_ltr is set to True, it means the doc should read
132+
text from left to right, vice versa.
133+
Defaults to True.
134+
vertical_ttb (bool, optional):
135+
When vertical_ttb is set to True, it means the doc should read
136+
text from top to bottom, vice versa.
137+
Defaults to True.
138+
extra_attrs (Optional[List[str]], optional):
139+
Passing a list of extra_attrs (e.g., ["fontname", "size"]) will
140+
restrict each words to characters that share exactly the same
141+
value for each of those `attributes extracted by pdfplumber
142+
<https://github.com/jsvine/pdfplumber/blob/develop/README.md#char-properties>`_,
143+
and the resulting word dicts will indicate those attributes.
144+
See details in `pdf2plumber's documentation
145+
<https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
146+
Defaults to `["fontname", "size"]`.
147+
dpi (int, optional):
148+
When loading images of the pdf, you can also specify the resolution
149+
(or `DPI, dots per inch <https://en.wikipedia.org/wiki/Dots_per_inch>`_)
150+
for rendering the images. Higher DPI values mean clearer images (also
151+
larger file sizes).
152+
Setting dpi will also automatically resizes the extracted pdf_layout
153+
to match the sizes of the images. Therefore, when visualizing the
154+
pdf_layouts, it can be rendered appropriately.
155+
Defaults to `DEFAULT_PDF_DPI=72`, which is also the default rendering dpi
156+
from the pdfplumber PDF parser.
157+
158+
Returns:
159+
List[Layout]:
160+
When `load_images=False`, it will only load the pdf_tokens from
161+
the PDF file. Each element of the list denotes all the tokens appeared
162+
on a single page, and the list is ordered the same as the original PDF
163+
page order.
164+
Tuple[List[Layout], List["Image.Image"]]:
165+
When `load_images=True`, besides the `all_page_layout`, it will also
166+
return a list of page images.
167+
168+
Examples::
169+
>>> import layoutparser as lp
170+
>>> pdf_layout = lp.load_pdf("path/to/pdf")
171+
>>> pdf_layout[0] # the layout for page 0
172+
>>> pdf_layout, pdf_images = lp.load_pdf("path/to/pdf", load_images=True)
173+
>>> lp.draw_box(pdf_images[0], pdf_layout[0])
174+
"""
175+
176+
plumber_pdf_object = pdfplumber.open(filename)
177+
178+
all_page_layout = []
179+
for page_id in range(len(plumber_pdf_object.pages)):
180+
cur_page = plumber_pdf_object.pages[page_id]
181+
182+
page_tokens = extract_words_for_page(
183+
cur_page,
184+
x_tolerance=x_tolerance,
185+
y_tolerance=y_tolerance,
186+
keep_blank_chars=keep_blank_chars,
187+
use_text_flow=use_text_flow,
188+
horizontal_ltr=horizontal_ltr,
189+
vertical_ttb=vertical_ttb,
190+
extra_attrs=extra_attrs,
191+
)
192+
193+
# Adding metadata for the current page
194+
page_tokens.page_data["width"] = float(cur_page.width)
195+
page_tokens.page_data["height"] = float(cur_page.height)
196+
page_tokens.page_data["index"] = page_id
197+
198+
all_page_layout.append(page_tokens)
199+
200+
if not load_images:
201+
return all_page_layout
202+
else:
203+
import pdf2image
204+
205+
pdf_images = pdf2image.convert_from_path(filename, dpi=dpi)
206+
207+
for page_id, page_image in enumerate(pdf_images):
208+
image_width, image_height = page_image.size
209+
page_layout = all_page_layout[page_id]
210+
layout_width = page_layout.page_data["width"]
211+
layout_height = page_layout.page_data["height"]
212+
if image_width != layout_width or image_height != layout_height:
213+
scale_x = image_width / layout_width
214+
scale_y = image_height / layout_height
215+
page_layout = page_layout.scale((scale_x, scale_y))
216+
page_layout.page_data["width"] = image_width
217+
page_layout.page_data["height"] = image_height
218+
all_page_layout[page_id] = page_layout
219+
220+
return all_page_layout, pdf_images

tests/fixtures/io/example.pdf

216 KB
Binary file not shown.

tests/test_io.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@
1414

1515
import numpy as np
1616
from layoutparser.elements import Interval, Rectangle, Quadrilateral, TextBlock, Layout
17-
from layoutparser.io import load_json, load_dict, load_csv
18-
17+
from layoutparser import load_json, load_dict, load_csv, load_pdf
1918

2019
def test_json():
2120

@@ -67,4 +66,16 @@ def test_csv():
6766
l2 = Layout([i2, r2, q2])
6867

6968
_l2 = load_csv("tests/fixtures/io/layout_textblock.csv")
70-
assert _l2 == l2
69+
assert _l2 == l2
70+
71+
72+
def test_pdf():
73+
pdf_layout = load_pdf("tests/fixtures/io/example.pdf")
74+
assert len(pdf_layout) == 1
75+
76+
page_layout = pdf_layout[0]
77+
for attr_name in ["width", "height", "index"]:
78+
assert attr_name in page_layout.page_data
79+
80+
assert len(set(ele.type for ele in page_layout)) == 3
81+
# Only three types of font show-up in the file

0 commit comments

Comments
 (0)