ispras
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 1 deletion b/‎.gitignore‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎Dockerfile‎
Lines changed: 1 addition & 0 deletions b/‎Dockerfile‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎README.md‎
Lines changed: 1 addition & 0 deletions b/‎README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎VERSION‎
Lines changed: 1 addition & 1 deletion b/‎VERSION‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎dedoc/api/api_args.py‎
Lines changed: 2 additions & 1 deletion b/‎dedoc/api/api_args.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎dedoc/api/dedoc_api.py‎
Lines changed: 10 additions & 2 deletions b/‎dedoc/api/dedoc_api.py‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎dedoc/api/process_handler.py‎
Lines changed: 49 additions & 10 deletions b/‎dedoc/api/process_handler.py‎
Lines changed: 49 additions & 10 deletions
diff --git a/‎dedoc/api/web/index.html‎
Lines changed: 7 additions & 2 deletions b/‎dedoc/api/web/index.html‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py‎
Lines changed: 13 additions & 15 deletions b/‎dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py‎
Lines changed: 13 additions & 15 deletions
diff --git a/‎dedoc/download_models.py‎
Lines changed: 6 additions & 3 deletions b/‎dedoc/download_models.py‎
Lines changed: 6 additions & 3 deletions
@@ -148,4 +148,4 @@ crashlytics-build.properties
 fabric.properties
 
 # Mac OS extensions
-*.DS_Store
+*.DS_Store
@@ -8,6 +8,7 @@ ENV RESOURCES_PATH "/dedoc_root/resources"
 
 COPY requirements.txt .
 RUN pip3 install --no-cache-dir -r requirements.txt
+RUN apt-get update && apt-get install -y --fix-missing --no-install-recommends fontforge
 
 RUN mkdir /dedoc_root
 RUN mkdir /dedoc_root/dedoc
 
@@ -94,6 +94,7 @@ Relevant documentation of dedoc is available [here](https://dedoc.readthedocs.io
 * Article [ISPRAS@FinTOC-2022 shared task: Two-stage TOC generation model](https://aclanthology.org/2022.fnp-1.13.pdf) for the [FinTOC 2022 Shared Task](https://wp.lancs.ac.uk/cfie/fintoc2022/). We are the winners :smiley: :trophy:!
 * Article on habr.com [Dedoc: как автоматически извлечь из текстового документа всё и даже немного больше](https://habr.com/ru/companies/isp_ras/articles/779390/) in Russian (2023)
 * Article [Dedoc: A Universal System for Extracting Content and Logical Structure From Textual Documents](https://ieeexplore.ieee.org/abstract/document/10508151/) in English (2023)
+* Article [Automatic verification of the text layer correctness in PDF documents](https://ieeexplore.ieee.org/abstract/document/10659388/) in English (2024)
 
 # Join Our Community
 
 
@@ -1 +1 @@
-2.3.2
+2.4
@@ -24,10 +24,11 @@ class QueryParameters:
     table_type: str = Form("", description="Pipeline mode for table recognition")
 
     # pdf handling
-    pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
+    pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby", "bad_encoding"],
                                     description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
     fast_textual_layer_detection: str = Form("false", enum=["true", "false"],
                                              description="Use non-ML solution to detect textual layer. Much faster but less accurate.")
+    each_page_textual_layer_detection: str = Form("false", enum=["true", "false"], description="Detect textual layer on each page. Slower but more accurate.")
     language: str = Form("rus+eng", description="Recognition language ('rus+eng', 'rus', 'eng', 'fra', 'spa')")
     pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
     is_one_column_document: str = Form("auto", enum=["auto", "true", "false"],
 
@@ -3,6 +3,7 @@
 import json
 import os
 import tempfile
+import traceback
 from typing import Optional
 
 from fastapi import Depends, FastAPI, File, Request, Response, UploadFile
@@ -13,23 +14,25 @@
 import dedoc.version
 from dedoc.api.api_args import QueryParameters
 from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree, json2txt
-from dedoc.api.process_handler import ProcessHandler
+from dedoc.api.process_handler import CancellationProcessHandler, ProcessHandler
 from dedoc.api.schema.parsed_document import ParsedDocument
 from dedoc.common.exceptions.dedoc_error import DedocError
 from dedoc.common.exceptions.missing_file_error import MissingFileError
 from dedoc.config import get_config
+from dedoc.utils.parameter_utils import get_bool_value
 from dedoc.utils.utils import save_upload_file
 
 config = get_config()
 logger = config["logger"]
 PORT = config["api_port"]
+ENABLE_CANCELLATION = get_bool_value(os.getenv("ENABLE_CANCELLATION"), True)
 static_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "web")
 static_files_dirs = config.get("static_files_dirs")
 
 app = FastAPI()
 app.mount("/web", StaticFiles(directory=config.get("static_path", static_path)), name="web")
 module_api_args = importlib.import_module(config["import_path_init_api_args"])
-process_handler = ProcessHandler(logger=logger)
+process_handler = CancellationProcessHandler(logger=logger) if ENABLE_CANCELLATION else ProcessHandler(logger=logger)
 
 
 @app.get("/")
@@ -136,6 +139,11 @@ async def exception_handler(request: Request, exc: DedocError) -> Response:
     return JSONResponse(status_code=exc.code, content=result)
 
 
+@app.exception_handler(Exception)
+async def any_exception_handler(request: Request, exc: Exception) -> JSONResponse:
+    return JSONResponse(status_code=500, content={"message": f"Exception {exc}\n{traceback.format_exc()}"})
+
+
 def get_api() -> FastAPI:
     """
     Return the module-level FastAPI application object (`app`).
     """
     return app
 
 
@@ -5,6 +5,7 @@
 import pickle
 import signal
 import traceback
+from abc import ABC, abstractmethod
 from multiprocessing import Process, Queue
 from typing import Optional
 from urllib.request import Request
@@ -18,7 +19,49 @@
 from dedoc.dedoc_manager import DedocManager
 
 
-class ProcessHandler:
+class AbstractProcessHandler:
+
+    def __init__(self, logger: logging.Logger) -> None:
+        self.logger = logger
+
+    @abstractmethod
+    async def handle(self, request: Request, parameters: dict, file_path: str, tmpdir: str) -> Optional[ParsedDocument]:
+        pass
+
+    def _add_base64_info_to_attachments(self, document_tree: ParsedDocument, attachments_dir: str) -> None:
+        for attachment in document_tree.attachments:
+            with open(os.path.join(attachments_dir, attachment.metadata.temporary_file_name), "rb") as attachment_file:
+                attachment.metadata.add_attribute("base64", base64.b64encode(attachment_file.read()).decode("utf-8"))
+
+
+class ProcessHandler(AbstractProcessHandler):
+    """
+    Simple synchronous document handler.
+    """
+    def __init__(self, logger: logging.Logger) -> None:
+        super().__init__(logger=logger)
+        self.manager = DedocManager(config=get_config())
+        self.logger.info("Using ProcessHandler, do not support parsing process termination")
+
+    async def handle(self, request: Request, parameters: dict, file_path: str, tmpdir: str) -> Optional[ParsedDocument]:
+        try:
+            return_format = str(parameters.get("return_format", "json")).lower()
+            document_tree = self.manager.parse(file_path, parameters={**dict(parameters), "attachments_dir": tmpdir})
+
+            if return_format == "html":
+                self._add_base64_info_to_attachments(document_tree, tmpdir)
+            return document_tree.to_api_schema()
+
+        except DedocError as e:
+            self.logger.error(f"Exception {e}: {e.msg_api}\n{traceback.format_exc()}")
+            raise e
+        except Exception as e:
+            exc_message = f"Exception {e}\n{traceback.format_exc()}"
+            self.logger.error(exc_message)
+            raise e
+
+
+class CancellationProcessHandler(AbstractProcessHandler):
     """
     Class for file parsing by DedocManager with support for client disconnection.
     If client disconnects during file parsing, the process of parsing is fully terminated and API is available to receive new connections.
@@ -32,11 +75,12 @@ class ProcessHandler:
     6. If client disconnects, the child process is terminated. The new child process with queues will start with the new request
     """
     def __init__(self, logger: logging.Logger) -> None:
+        super().__init__(logger=logger)
         self.input_queue = Queue()
         self.output_queue = Queue()
-        self.logger = logger
-        self.process = Process(target=self.__parse_file, args=[self.input_queue, self.output_queue])
+        self.process = Process(target=self._parse_file, args=[self.input_queue, self.output_queue])
         self.process.start()
+        self.logger.info("Using CancellationProcessHandler, support parsing process termination when client disconnects")
 
     async def handle(self, request: Request, parameters: dict, file_path: str, tmpdir: str) -> Optional[ParsedDocument]:
         """
@@ -70,7 +114,7 @@ async def handle(self, request: Request, parameters: dict, file_path: str, tmpdi
 
         raise DedocError.from_dict(result)
 
-    def __parse_file(self, input_queue: Queue, output_queue: Queue) -> None:
+    def _parse_file(self, input_queue: Queue, output_queue: Queue) -> None:
         """
         Function for file parsing in a separate (child) process.
         It's a background process, i.e. it is waiting for a task in the input queue.
@@ -95,7 +139,7 @@ def __parse_file(self, input_queue: Queue, output_queue: Queue) -> None:
                 document_tree = manager.parse(file_path, parameters={**dict(parameters), "attachments_dir": tmp_dir})
 
                 if return_format == "html":
-                    self.__add_base64_info_to_attachments(document_tree, tmp_dir)
+                    self._add_base64_info_to_attachments(document_tree, tmp_dir)
 
                 output_queue.put(pickle.dumps(document_tree.to_api_schema()), block=True)
                 manager.logger.info("Parsing process put task to the output queue")
@@ -108,8 +152,3 @@ def __parse_file(self, input_queue: Queue, output_queue: Queue) -> None:
                 filename = "" if file_path is None else os.path.basename(file_path)
                 manager.logger.error(exc_message)
                 output_queue.put(pickle.dumps({"msg": exc_message, "filename": filename}), block=True)
-
-    def __add_base64_info_to_attachments(self, document_tree: ParsedDocument, attachments_dir: str) -> None:
-        for attachment in document_tree.attachments:
-            with open(os.path.join(attachments_dir, attachment.metadata.temporary_file_name), "rb") as attachment_file:
-                attachment.metadata.add_attribute("base64", base64.b64encode(attachment_file.read()).decode("utf-8"))
 
@@ -100,7 +100,7 @@ <h4>Attachments handling</h4>
 
         <div class="parameters">
             <h4>PDF handling</h4>
-            <details><summary>pdf_with_text_layer, need_pdf_table_analysis, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis</summary>
+            <details><summary>pdf_with_text_layer, need_pdf_table_analysis, fast_textual_layer_detection, each_page_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis</summary>
                 <br>
                 <p>
                     <label>
@@ -110,14 +110,19 @@ <h4>PDF handling</h4>
                             <option value="auto">auto</option>
                             <option value="auto_tabby" selected>auto_tabby</option>
                             <option value="tabby">tabby</option>
+                            <option value="bad_encoding">bad_encoding</option>
                         </select> pdf_with_text_layer
                     </label>
                 </p>
 
                 <p>
                     <label><input name="fast_textual_layer_detection" type="checkbox" value="true"> fast_textual_layer_detection</label>
                 </p>
-                
+
+                <p>
+                    <label><input name="each_page_textual_layer_detection" type="checkbox" value="true"> each_page_textual_layer_detection</label>
+                </p>
+
                 <p>
                     <label> language
                         <input name="language" list="language" size="8" placeholder="rus+eng">
 
@@ -1,6 +1,6 @@
 from typing import List, Optional, Tuple
 
-from PyPDF2.pdf import PageObject, PdfFileReader
+from pypdf import PageObject, PdfReader
 
 from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor
 from dedoc.data_structures.attached_file import AttachedFile
@@ -22,15 +22,15 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att
         the methods' parameters.
         """
         import os
-        from PyPDF2.utils import PdfReadError
         from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis
+        from pypdf.errors import PdfReadError
 
         parameters = {} if parameters is None else parameters
         filename = os.path.basename(file_path)
 
         with open(file_path, "rb") as handler:
             try:
-                reader = PdfFileReader(handler)
+                reader = PdfReader(handler)
             except Exception as e:
                 self.logger.warning(f"can't handle {filename}, get {e}")
                 return []
@@ -55,13 +55,13 @@ def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]:
         if "/Annots" in page.keys():
             for annot in page["/Annots"]:
                 # Other subtypes, such as /Link, cause errors
-                subtype = annot.getObject().get("/Subtype")
+                subtype = annot.get_object().get("/Subtype")
                 if subtype == "/FileAttachment":
-                    name = annot.getObject()["/FS"]["/UF"]
-                    data = annot.getObject()["/FS"]["/EF"]["/F"].getData()  # The file containing the stream data.
+                    name = annot.get_object()["/FS"]["/UF"]
+                    data = annot.get_object()["/FS"]["/EF"]["/F"].get_data()  # The file containing the stream data.
                     attachments.append([name, data])
-                if subtype == "/Text" and annot.getObject().get("/Name") == "/Comment":  # it is messages (notes) in PDF
-                    note = annot.getObject()
+                if subtype == "/Text" and annot.get_object().get("/Name") == "/Comment":  # it is messages (notes) in PDF
+                    note = annot.get_object()
                     created_time = convert_datetime(note["/CreationDate"]) if "/CreationDate" in note else None
                     modified_time = convert_datetime(note["/M"]) if "/M" in note else None
                     user = note.get("/T")
@@ -71,17 +71,15 @@ def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]:
                     attachments.append((name, bytes(content)))
         return attachments
 
-    def __get_page_level_attachments(self, reader: PdfFileReader) -> List[Tuple[str, bytes]]:
-        cnt_page = reader.getNumPages()
+    def __get_page_level_attachments(self, reader: PdfReader) -> List[Tuple[str, bytes]]:
         attachments = []
-        for i in range(cnt_page):
-            page = reader.getPage(i)
+        for page in reader.pages:
             attachments_on_page = self.__get_notes(page)
             attachments.extend(attachments_on_page)
 
         return attachments
 
-    def __get_root_attachments(self, reader: PdfFileReader) -> List[Tuple[str, bytes]]:
+    def __get_root_attachments(self, reader: PdfReader) -> List[Tuple[str, bytes]]:
         """
         Retrieves the file attachments of the PDF as a dictionary of file names and the file data as a bytestring.
 
@@ -96,9 +94,9 @@ def __get_root_attachments(self, reader: PdfFileReader) -> List[Tuple[str, bytes
             for f in file_names:
                 if isinstance(f, str):
                     data_index = file_names.index(f) + 1
-                    dict_object = file_names[data_index].getObject()
+                    dict_object = file_names[data_index].get_object()
                     if "/EF" in dict_object and "/F" in dict_object["/EF"]:
-                        data = dict_object["/EF"]["/F"].getData()
+                        data = dict_object["/EF"]["/F"].get_data()
                         name = dict_object.get("/UF", f"pdf_attach_{uuid.uuid4()}")
                         attachments.append((name, data))
 
 
@@ -10,17 +10,18 @@
     font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07",
     paragraph_classifier="97c4b78bc20d87ec7d53389e09f1ca35c6ade067",
     line_type_classifiers="6ad0eacbfdea065b658cb6f039d13f75245d51ae",
-    fintoc_classifiers="6a907b7d2437c3f61ac9c506f67175207982fae8"
+    fintoc_classifiers="6a907b7d2437c3f61ac9c506f67175207982fae8",
+    torch_cnn="5333909f858f5f632df478ef5a53af6dfd26f2e1"
 )
 
 
-def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str) -> None:
+def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str, user_name: str = "dedoc") -> None:
     import os
     import shutil
     from huggingface_hub import hf_hub_download
 
     os.makedirs(out_dir, exist_ok=True)
-    path = os.path.realpath(hf_hub_download(repo_id=f"dedoc/{repo_name}", filename=hub_name, revision=model_hash_dict[repo_name]))
+    path = os.path.realpath(hf_hub_download(repo_id=f"{user_name}/{repo_name}", filename=hub_name, revision=model_hash_dict[repo_name]))
     shutil.move(path, os.path.join(out_dir, out_name))
 
 
@@ -51,6 +52,8 @@ def download(resources_path: str) -> None:
                               repo_name="fintoc_classifiers",
                               hub_name=f"{classifier_type}_classifier_{language}_txt_layer.json")
 
+    download_from_hub(out_dir=resources_path, out_name="glyph_recognizer.pt", repo_name="torch_cnn", hub_name="rus_eng.pt", user_name="sinkudo")
+
 
 if __name__ == "__main__":
     from dedoc.config import get_config