1
+ # Copyright 2021 The Layout Parser team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import List , Union , Optional , Dict , Tuple
16
+
17
+ import pdfplumber
18
+ import pandas as pd
19
+
20
+ from ..elements import Layout
21
+ from .basic import load_dataframe
22
+
23
# Default resolution (dots per inch) used by `load_pdf` when rendering page
# images. Per the `load_pdf` docstring, 72 is also the default rendering dpi
# of the pdfplumber PDF parser.
DEFAULT_PDF_DPI = 72
24
+
25
+
26
def extract_words_for_page(
    page: pdfplumber.page.Page,
    x_tolerance=1.5,
    y_tolerance=2,
    keep_blank_chars=False,
    use_text_flow=True,
    horizontal_ltr=True,
    vertical_ttb=True,
    extra_attrs=None,
) -> Layout:
    """The helper function used for extracting words from a pdfplumber page
    object.

    All keyword arguments are forwarded unchanged to
    `pdfplumber.page.Page.extract_words`.

    Args:
        page (pdfplumber.page.Page): The page to extract word tokens from.
        x_tolerance (optional): Forwarded to `extract_words`. Defaults to 1.5.
        y_tolerance (optional): Forwarded to `extract_words`. Defaults to 2.
        keep_blank_chars (bool, optional): Forwarded to `extract_words`.
            Defaults to False.
        use_text_flow (bool, optional): Forwarded to `extract_words`.
            Defaults to True.
        horizontal_ltr (bool, optional): Forwarded to `extract_words`.
            Defaults to True.
        vertical_ttb (bool, optional): Forwarded to `extract_words`.
            Defaults to True.
        extra_attrs (Optional[List[str]], optional): Forwarded to
            `extract_words`. When None, `["fontname", "size"]` is used.

    Returns:
        Layout: a layout object representing all extracted pdf tokens on this page.
            An empty Layout is returned when the page contains no tokens.
    """
    if extra_attrs is None:
        extra_attrs = ["fontname", "size"]

    tokens = page.extract_words(
        x_tolerance=x_tolerance,
        y_tolerance=y_tolerance,
        keep_blank_chars=keep_blank_chars,
        use_text_flow=use_text_flow,
        horizontal_ltr=horizontal_ltr,
        vertical_ttb=vertical_ttb,
        extra_attrs=extra_attrs,
    )

    # A page without extractable text (e.g., a blank or image-only page)
    # yields no tokens. The DataFrame built from an empty list has no columns,
    # so the coordinate selections below would raise a KeyError; return an
    # empty layout for such pages instead.
    if not tokens:
        return Layout()

    df = pd.DataFrame(tokens)

    # Keep the token coordinates inside the page boundary.
    df[["x0", "x1"]] = (
        df[["x0", "x1"]].clip(lower=0, upper=int(page.width)).astype("float")
    )
    df[["top", "bottom"]] = (
        df[["top", "bottom"]].clip(lower=0, upper=int(page.height)).astype("float")
    )

    # Map pdfplumber's column names onto the names expected by load_dataframe.
    page_tokens = load_dataframe(
        df.rename(
            columns={
                "x0": "x_1",
                "x1": "x_2",
                "top": "y_1",
                "bottom": "y_2",
                "fontname": "type",  # also loading fontname as "type"
            }
        ),
        block_type="rectangle",
    )

    return page_tokens
77
+
78
+
79
def load_pdf(
    filename: str,
    load_images: bool = False,
    x_tolerance: float = 1.5,
    y_tolerance: float = 2,
    keep_blank_chars: bool = False,
    use_text_flow: bool = True,
    horizontal_ltr: bool = True,
    vertical_ttb: bool = True,
    extra_attrs: Optional[List[str]] = None,
    dpi: int = DEFAULT_PDF_DPI,
) -> Union[List[Layout], Tuple[List[Layout], List["Image.Image"]]]:
    """Load all tokens for each page from a PDF file, and save them
    in a list of Layout objects with the original page order.

    Args:
        filename (str): The path to the PDF file.
        load_images (bool, optional):
            Whether to load a screenshot for each page of the PDF file.
            When set to true, the function will return both the layout and
            screenshot image for each page.
            Defaults to False.
        x_tolerance (float, optional):
            The threshold used for extracting "word tokens" from the pdf file.
            It will merge the pdf characters into a word token if the difference
            between the x_2 of one character and the x_1 of the next is less than
            or equal to x_tolerance. See details in `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to 1.5.
        y_tolerance (float, optional):
            The threshold used for extracting "word tokens" from the pdf file.
            It will merge the pdf characters into a word token if the difference
            between the y_2 of one character and the y_1 of the next is less than
            or equal to y_tolerance. See details in `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to 2.
        keep_blank_chars (bool, optional):
            When keep_blank_chars is set to True, blank characters are
            treated as part of a word, not as a space between words. See
            details in `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to False.
        use_text_flow (bool, optional):
            When use_text_flow is set to True, it will use the PDF's underlying
            flow of characters as a guide for ordering and segmenting the words,
            rather than presorting the characters by x/y position. (This mimics
            how dragging a cursor highlights text in a PDF; as with that, the
            order does not always appear to be logical.) See details in
            `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to True.
        horizontal_ltr (bool, optional):
            When horizontal_ltr is set to True, it means the doc should read
            text from left to right, vice versa.
            Defaults to True.
        vertical_ttb (bool, optional):
            When vertical_ttb is set to True, it means the doc should read
            text from top to bottom, vice versa.
            Defaults to True.
        extra_attrs (Optional[List[str]], optional):
            Passing a list of extra_attrs (e.g., ["fontname", "size"]) will
            restrict each word to characters that share exactly the same
            value for each of those `attributes extracted by pdfplumber
            <https://github.com/jsvine/pdfplumber/blob/develop/README.md#char-properties>`_,
            and the resulting word dicts will indicate those attributes.
            See details in `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to `["fontname", "size"]`.
        dpi (int, optional):
            When loading images of the pdf, you can also specify the resolution
            (or `DPI, dots per inch <https://en.wikipedia.org/wiki/Dots_per_inch>`_)
            for rendering the images. Higher DPI values mean clearer images (also
            larger file sizes).
            When `load_images=True`, setting dpi will also automatically resize
            the extracted pdf_layout to match the sizes of the images. Therefore,
            when visualizing the pdf_layouts, it can be rendered appropriately.
            This argument has no effect when `load_images=False`.
            Defaults to `DEFAULT_PDF_DPI=72`, which is also the default rendering dpi
            from the pdfplumber PDF parser.

    Returns:
        List[Layout]:
            When `load_images=False`, it will only load the pdf_tokens from
            the PDF file. Each element of the list denotes all the tokens appeared
            on a single page, and the list is ordered the same as the original PDF
            page order.
        Tuple[List[Layout], List["Image.Image"]]:
            When `load_images=True`, besides the `all_page_layout`, it will also
            return a list of page images.

    Examples::
        >>> import layoutparser as lp
        >>> pdf_layout = lp.load_pdf("path/to/pdf")
        >>> pdf_layout[0] # the layout for page 0
        >>> pdf_layout, pdf_images = lp.load_pdf("path/to/pdf", load_images=True)
        >>> lp.draw_box(pdf_images[0], pdf_layout[0])
    """

    all_page_layout = []

    # Open the PDF through a context manager so that the underlying file
    # handle is released as soon as all pages are processed, even if the
    # extraction of a page raises.
    with pdfplumber.open(filename) as plumber_pdf_object:
        for page_id, cur_page in enumerate(plumber_pdf_object.pages):
            page_tokens = extract_words_for_page(
                cur_page,
                x_tolerance=x_tolerance,
                y_tolerance=y_tolerance,
                keep_blank_chars=keep_blank_chars,
                use_text_flow=use_text_flow,
                horizontal_ltr=horizontal_ltr,
                vertical_ttb=vertical_ttb,
                extra_attrs=extra_attrs,
            )

            # Adding metadata for the current page
            page_tokens.page_data["width"] = float(cur_page.width)
            page_tokens.page_data["height"] = float(cur_page.height)
            page_tokens.page_data["index"] = page_id

            all_page_layout.append(page_tokens)

    if not load_images:
        return all_page_layout

    # pdf2image is only needed for rendering the page images, so it is
    # imported lazily here.
    import pdf2image

    pdf_images = pdf2image.convert_from_path(filename, dpi=dpi)

    for page_id, page_image in enumerate(pdf_images):
        image_width, image_height = page_image.size
        page_layout = all_page_layout[page_id]
        layout_width = page_layout.page_data["width"]
        layout_height = page_layout.page_data["height"]
        if image_width != layout_width or image_height != layout_height:
            # The rendered image size differs from the page size reported by
            # pdfplumber: rescale the token coordinates so that they line up
            # with the image, and record the image size as the page size.
            scale_x = image_width / layout_width
            scale_y = image_height / layout_height
            page_layout = page_layout.scale((scale_x, scale_y))
            page_layout.page_data["width"] = image_width
            page_layout.page_data["height"] = image_height
            all_page_layout[page_id] = page_layout

    return all_page_layout, pdf_images
0 commit comments