Skip to content

Commit be737a7

Browse files
NastyBoget, sunveil, sinkudo
authored
new version 2.4 (#532)
Co-authored-by: sunveil <[email protected]> Co-authored-by: sinkudo <[email protected]>
1 parent c074035 commit be737a7

File tree

62 files changed

+1513
-267
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

62 files changed

+1513
-267
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,4 +148,4 @@ crashlytics-build.properties
148148
fabric.properties
149149

150150
# Mac OS extensions
151-
*.DS_Store
151+
*.DS_Store

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ ENV RESOURCES_PATH "/dedoc_root/resources"
88

99
COPY requirements.txt .
1010
RUN pip3 install --no-cache-dir -r requirements.txt
11+
RUN apt-get update && apt-get install -y --fix-missing --no-install-recommends fontforge
1112

1213
RUN mkdir /dedoc_root
1314
RUN mkdir /dedoc_root/dedoc

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ Relevant documentation of dedoc is available [here](https://dedoc.readthedocs.io
9494
* Article [ISPRAS@FinTOC-2022 shared task: Two-stage TOC generation model](https://aclanthology.org/2022.fnp-1.13.pdf) for the [FinTOC 2022 Shared Task](https://wp.lancs.ac.uk/cfie/fintoc2022/). We are the winners :smiley: :trophy:!
9595
* Article on habr.com [Dedoc: как автоматически извлечь из текстового документа всё и даже немного больше](https://habr.com/ru/companies/isp_ras/articles/779390/) in Russian (2023)
9696
* Article [Dedoc: A Universal System for Extracting Content and Logical Structure From Textual Documents](https://ieeexplore.ieee.org/abstract/document/10508151/) in English (2023)
97+
* Article [Automatic verification of the text layer correctness in PDF documents](https://ieeexplore.ieee.org/abstract/document/10659388/) in English (2024)
9798

9899
# Join Our Community
99100

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.3.2
1+
2.4

dedoc/api/api_args.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,11 @@ class QueryParameters:
2424
table_type: str = Form("", description="Pipeline mode for table recognition")
2525

2626
# pdf handling
27-
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
27+
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby", "bad_encoding"],
2828
description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
2929
fast_textual_layer_detection: str = Form("false", enum=["true", "false"],
3030
description="Use non-ML solution to detect textual layer. Much faster but less accurate.")
31+
each_page_textual_layer_detection: str = Form("false", enum=["true", "false"], description="Detect textual layer on each page. Slower but more accurate.")
3132
language: str = Form("rus+eng", description="Recognition language ('rus+eng', 'rus', 'eng', 'fra', 'spa')")
3233
pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
3334
is_one_column_document: str = Form("auto", enum=["auto", "true", "false"],

dedoc/api/dedoc_api.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import json
44
import os
55
import tempfile
6+
import traceback
67
from typing import Optional
78

89
from fastapi import Depends, FastAPI, File, Request, Response, UploadFile
@@ -13,23 +14,25 @@
1314
import dedoc.version
1415
from dedoc.api.api_args import QueryParameters
1516
from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree, json2txt
16-
from dedoc.api.process_handler import ProcessHandler
17+
from dedoc.api.process_handler import CancellationProcessHandler, ProcessHandler
1718
from dedoc.api.schema.parsed_document import ParsedDocument
1819
from dedoc.common.exceptions.dedoc_error import DedocError
1920
from dedoc.common.exceptions.missing_file_error import MissingFileError
2021
from dedoc.config import get_config
22+
from dedoc.utils.parameter_utils import get_bool_value
2123
from dedoc.utils.utils import save_upload_file
2224

2325
config = get_config()
2426
logger = config["logger"]
2527
PORT = config["api_port"]
28+
ENABLE_CANCELLATION = get_bool_value(os.getenv("ENABLE_CANCELLATION"), True)
2629
static_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "web")
2730
static_files_dirs = config.get("static_files_dirs")
2831

2932
app = FastAPI()
3033
app.mount("/web", StaticFiles(directory=config.get("static_path", static_path)), name="web")
3134
module_api_args = importlib.import_module(config["import_path_init_api_args"])
32-
process_handler = ProcessHandler(logger=logger)
35+
process_handler = CancellationProcessHandler(logger=logger) if ENABLE_CANCELLATION else ProcessHandler(logger=logger)
3336

3437

3538
@app.get("/")
@@ -136,6 +139,11 @@ async def exception_handler(request: Request, exc: DedocError) -> Response:
136139
return JSONResponse(status_code=exc.code, content=result)
137140

138141

142+
@app.exception_handler(Exception)
143+
async def any_exception_handler(request: Request, exc: Exception) -> JSONResponse:
144+
return JSONResponse(status_code=500, content={"message": f"Exception {exc}\n{traceback.format_exc()}"})
145+
146+
139147
def get_api() -> FastAPI:
140148
return app
141149

dedoc/api/process_handler.py

Lines changed: 49 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import pickle
66
import signal
77
import traceback
8+
from abc import abstractmethod
89
from multiprocessing import Process, Queue
910
from typing import Optional
1011
from urllib.request import Request
@@ -18,7 +19,49 @@
1819
from dedoc.dedoc_manager import DedocManager
1920

2021

21-
class ProcessHandler:
22+
class AbstractProcessHandler:
23+
24+
def __init__(self, logger: logging.Logger) -> None:
25+
self.logger = logger
26+
27+
@abstractmethod
28+
async def handle(self, request: Request, parameters: dict, file_path: str, tmpdir: str) -> Optional[ParsedDocument]:
29+
pass
30+
31+
def _add_base64_info_to_attachments(self, document_tree: ParsedDocument, attachments_dir: str) -> None:
32+
for attachment in document_tree.attachments:
33+
with open(os.path.join(attachments_dir, attachment.metadata.temporary_file_name), "rb") as attachment_file:
34+
attachment.metadata.add_attribute("base64", base64.b64encode(attachment_file.read()).decode("utf-8"))
35+
36+
37+
class ProcessHandler(AbstractProcessHandler):
38+
"""
39+
Simple synchronous document handler.
40+
"""
41+
def __init__(self, logger: logging.Logger) -> None:
42+
super().__init__(logger=logger)
43+
self.manager = DedocManager(config=get_config())
44+
self.logger.info("Using ProcessHandler, do not support parsing process termination")
45+
46+
async def handle(self, request: Request, parameters: dict, file_path: str, tmpdir: str) -> Optional[ParsedDocument]:
47+
try:
48+
return_format = str(parameters.get("return_format", "json")).lower()
49+
document_tree = self.manager.parse(file_path, parameters={**dict(parameters), "attachments_dir": tmpdir})
50+
51+
if return_format == "html":
52+
self._add_base64_info_to_attachments(document_tree, tmpdir)
53+
return document_tree.to_api_schema()
54+
55+
except DedocError as e:
56+
self.logger.error(f"Exception {e}: {e.msg_api}\n{traceback.format_exc()}")
57+
raise e
58+
except Exception as e:
59+
exc_message = f"Exception {e}\n{traceback.format_exc()}"
60+
self.logger.error(exc_message)
61+
raise e
62+
63+
64+
class CancellationProcessHandler(AbstractProcessHandler):
2265
"""
2366
Class for file parsing by DedocManager with support for client disconnection.
2467
If client disconnects during file parsing, the process of parsing is fully terminated and API is available to receive new connections.
@@ -32,11 +75,12 @@ class ProcessHandler:
3275
6. If client disconnects, the child process is terminated. The new child process with queues will start with the new request
3376
"""
3477
def __init__(self, logger: logging.Logger) -> None:
78+
super().__init__(logger=logger)
3579
self.input_queue = Queue()
3680
self.output_queue = Queue()
37-
self.logger = logger
38-
self.process = Process(target=self.__parse_file, args=[self.input_queue, self.output_queue])
81+
self.process = Process(target=self._parse_file, args=[self.input_queue, self.output_queue])
3982
self.process.start()
83+
self.logger.info("Using CancellationProcessHandler, support parsing process termination when client disconnects")
4084

4185
async def handle(self, request: Request, parameters: dict, file_path: str, tmpdir: str) -> Optional[ParsedDocument]:
4286
"""
@@ -70,7 +114,7 @@ async def handle(self, request: Request, parameters: dict, file_path: str, tmpdi
70114

71115
raise DedocError.from_dict(result)
72116

73-
def __parse_file(self, input_queue: Queue, output_queue: Queue) -> None:
117+
def _parse_file(self, input_queue: Queue, output_queue: Queue) -> None:
74118
"""
75119
Function for file parsing in a separate (child) process.
76120
It's a background process, i.e. it is waiting for a task in the input queue.
@@ -95,7 +139,7 @@ def __parse_file(self, input_queue: Queue, output_queue: Queue) -> None:
95139
document_tree = manager.parse(file_path, parameters={**dict(parameters), "attachments_dir": tmp_dir})
96140

97141
if return_format == "html":
98-
self.__add_base64_info_to_attachments(document_tree, tmp_dir)
142+
self._add_base64_info_to_attachments(document_tree, tmp_dir)
99143

100144
output_queue.put(pickle.dumps(document_tree.to_api_schema()), block=True)
101145
manager.logger.info("Parsing process put task to the output queue")
@@ -108,8 +152,3 @@ def __parse_file(self, input_queue: Queue, output_queue: Queue) -> None:
108152
filename = "" if file_path is None else os.path.basename(file_path)
109153
manager.logger.error(exc_message)
110154
output_queue.put(pickle.dumps({"msg": exc_message, "filename": filename}), block=True)
111-
112-
def __add_base64_info_to_attachments(self, document_tree: ParsedDocument, attachments_dir: str) -> None:
113-
for attachment in document_tree.attachments:
114-
with open(os.path.join(attachments_dir, attachment.metadata.temporary_file_name), "rb") as attachment_file:
115-
attachment.metadata.add_attribute("base64", base64.b64encode(attachment_file.read()).decode("utf-8"))

dedoc/api/web/index.html

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ <h4>Attachments handling</h4>
100100

101101
<div class="parameters">
102102
<h4>PDF handling</h4>
103-
<details><summary>pdf_with_text_layer, need_pdf_table_analysis, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis</summary>
103+
<details><summary>pdf_with_text_layer, need_pdf_table_analysis, fast_textual_layer_detection, each_page_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis</summary>
104104
<br>
105105
<p>
106106
<label>
@@ -110,14 +110,19 @@ <h4>PDF handling</h4>
110110
<option value="auto">auto</option>
111111
<option value="auto_tabby" selected>auto_tabby</option>
112112
<option value="tabby">tabby</option>
113+
<option value="bad_encoding">bad_encoding</option>
113114
</select> pdf_with_text_layer
114115
</label>
115116
</p>
116117

117118
<p>
118119
<label><input name="fast_textual_layer_detection" type="checkbox" value="true"> fast_textual_layer_detection</label>
119120
</p>
120-
121+
122+
<p>
123+
<label><input name="each_page_textual_layer_detection" type="checkbox" value="true"> each_page_textual_layer_detection</label>
124+
</p>
125+
121126
<p>
122127
<label> language
123128
<input name="language" list="language" size="8" placeholder="rus+eng">

dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from typing import List, Optional, Tuple
22

3-
from PyPDF2.pdf import PageObject, PdfFileReader
3+
from pypdf import PageObject, PdfReader
44

55
from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor
66
from dedoc.data_structures.attached_file import AttachedFile
@@ -22,15 +22,15 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att
2222
the methods' parameters.
2323
"""
2424
import os
25-
from PyPDF2.utils import PdfReadError
2625
from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis
26+
from pypdf.errors import PdfReadError
2727

2828
parameters = {} if parameters is None else parameters
2929
filename = os.path.basename(file_path)
3030

3131
with open(file_path, "rb") as handler:
3232
try:
33-
reader = PdfFileReader(handler)
33+
reader = PdfReader(handler)
3434
except Exception as e:
3535
self.logger.warning(f"can't handle {filename}, get {e}")
3636
return []
@@ -55,13 +55,13 @@ def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]:
5555
if "/Annots" in page.keys():
5656
for annot in page["/Annots"]:
5757
# Other subtypes, such as /Link, cause errors
58-
subtype = annot.getObject().get("/Subtype")
58+
subtype = annot.get_object().get("/Subtype")
5959
if subtype == "/FileAttachment":
60-
name = annot.getObject()["/FS"]["/UF"]
61-
data = annot.getObject()["/FS"]["/EF"]["/F"].getData() # The file containing the stream data.
60+
name = annot.get_object()["/FS"]["/UF"]
61+
data = annot.get_object()["/FS"]["/EF"]["/F"].get_data() # The file containing the stream data.
6262
attachments.append([name, data])
63-
if subtype == "/Text" and annot.getObject().get("/Name") == "/Comment": # it is messages (notes) in PDF
64-
note = annot.getObject()
63+
if subtype == "/Text" and annot.get_object().get("/Name") == "/Comment": # it is messages (notes) in PDF
64+
note = annot.get_object()
6565
created_time = convert_datetime(note["/CreationDate"]) if "/CreationDate" in note else None
6666
modified_time = convert_datetime(note["/M"]) if "/M" in note else None
6767
user = note.get("/T")
@@ -71,17 +71,15 @@ def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]:
7171
attachments.append((name, bytes(content)))
7272
return attachments
7373

74-
def __get_page_level_attachments(self, reader: PdfFileReader) -> List[Tuple[str, bytes]]:
75-
cnt_page = reader.getNumPages()
74+
def __get_page_level_attachments(self, reader: PdfReader) -> List[Tuple[str, bytes]]:
7675
attachments = []
77-
for i in range(cnt_page):
78-
page = reader.getPage(i)
76+
for page in reader.pages:
7977
attachments_on_page = self.__get_notes(page)
8078
attachments.extend(attachments_on_page)
8179

8280
return attachments
8381

84-
def __get_root_attachments(self, reader: PdfFileReader) -> List[Tuple[str, bytes]]:
82+
def __get_root_attachments(self, reader: PdfReader) -> List[Tuple[str, bytes]]:
8583
"""
8684
Retrieves the file attachments of the PDF as a list of (file name, file data) tuples, where the file data is a bytestring.
8785
@@ -96,9 +94,9 @@ def __get_root_attachments(self, reader: PdfFileReader) -> List[Tuple[str, bytes
9694
for f in file_names:
9795
if isinstance(f, str):
9896
data_index = file_names.index(f) + 1
99-
dict_object = file_names[data_index].getObject()
97+
dict_object = file_names[data_index].get_object()
10098
if "/EF" in dict_object and "/F" in dict_object["/EF"]:
101-
data = dict_object["/EF"]["/F"].getData()
99+
data = dict_object["/EF"]["/F"].get_data()
102100
name = dict_object.get("/UF", f"pdf_attach_{uuid.uuid4()}")
103101
attachments.append((name, data))
104102

dedoc/download_models.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,18 @@
1010
font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07",
1111
paragraph_classifier="97c4b78bc20d87ec7d53389e09f1ca35c6ade067",
1212
line_type_classifiers="6ad0eacbfdea065b658cb6f039d13f75245d51ae",
13-
fintoc_classifiers="6a907b7d2437c3f61ac9c506f67175207982fae8"
13+
fintoc_classifiers="6a907b7d2437c3f61ac9c506f67175207982fae8",
14+
torch_cnn="5333909f858f5f632df478ef5a53af6dfd26f2e1"
1415
)
1516

1617

17-
def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str) -> None:
18+
def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str, user_name: str = "dedoc") -> None:
1819
import os
1920
import shutil
2021
from huggingface_hub import hf_hub_download
2122

2223
os.makedirs(out_dir, exist_ok=True)
23-
path = os.path.realpath(hf_hub_download(repo_id=f"dedoc/{repo_name}", filename=hub_name, revision=model_hash_dict[repo_name]))
24+
path = os.path.realpath(hf_hub_download(repo_id=f"{user_name}/{repo_name}", filename=hub_name, revision=model_hash_dict[repo_name]))
2425
shutil.move(path, os.path.join(out_dir, out_name))
2526

2627

@@ -51,6 +52,8 @@ def download(resources_path: str) -> None:
5152
repo_name="fintoc_classifiers",
5253
hub_name=f"{classifier_type}_classifier_{language}_txt_layer.json")
5354

55+
download_from_hub(out_dir=resources_path, out_name="glyph_recognizer.pt", repo_name="torch_cnn", hub_name="rus_eng.pt", user_name="sinkudo")
56+
5457

5558
if __name__ == "__main__":
5659
from dedoc.config import get_config

0 commit comments

Comments
 (0)