|
2 | 2 | from collections import namedtuple |
3 | 3 | from typing import Dict, Iterator, List, Optional, Set, Tuple |
4 | 4 |
|
5 | | -import numpy as np |
6 | 5 | from dedocutils.data_structures.bbox import BBox |
7 | 6 | from numpy import ndarray |
8 | 7 |
|
|
13 | 12 | from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation |
14 | 13 | from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment |
15 | 14 | from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable |
16 | | -from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer |
| 15 | + |
17 | 16 |
|
18 | 17 | ParametersForParseDoc = namedtuple("ParametersForParseDoc", [ |
19 | 18 | "orient_analysis_cells", |
@@ -45,6 +44,7 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti |
45 | 44 |
|
46 | 45 | from dedoc.readers.pdf_reader.pdf_image_reader.line_metadata_extractor.metadata_extractor import LineMetadataExtractor |
47 | 46 | from dedoc.readers.pdf_reader.pdf_image_reader.paragraph_extractor.scan_paragraph_classifier_extractor import ScanParagraphClassifierExtractor |
| 47 | + from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer |
48 | 48 | from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer |
49 | 49 | from dedoc.readers.pdf_reader.utils.line_object_linker import LineObjectLinker |
50 | 50 | from dedoc.attachments_extractors.concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor |
@@ -153,24 +153,24 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> ( |
153 | 153 | metadata["rotated_page_angles"] = page_angles |
154 | 154 | return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata |
155 | 155 |
|
156 | | - def _process_document_with_gost_frame(self, images: Iterator[np.ndarray], first_page: int, parameters: ParametersForParseDoc, path: str) -> \ |
157 | | - Tuple[Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]], Dict[int, Tuple[np.ndarray, BBox, Tuple[int, ...]]]]: |
| 156 | + def _process_document_with_gost_frame(self, images: Iterator[ndarray], first_page: int, parameters: ParametersForParseDoc, path: str) -> \ |
| 157 | + Tuple[Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]], Dict[int, Tuple[ndarray, BBox, Tuple[int, ...]]]]: |
158 | 158 | from joblib import Parallel, delayed |
159 | 159 | from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader |
160 | 160 |
|
161 | 161 | gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images) |
162 | 162 | page_range = range(first_page, first_page + len(gost_analyzed_images)) |
163 | 163 | gost_analyzed_images = dict(zip(page_range, gost_analyzed_images)) |
164 | 164 | if isinstance(self, PdfTxtlayerReader): |
165 | | - self.gost_frame_boxes = dict(zip(page_range, [item[1] for item in gost_analyzed_images.values()])) |
| 165 | + self.gost_frame_boxes = dict(zip(page_range, [(item[1], item[2]) for item in gost_analyzed_images.values()])) |
166 | 166 | result = Parallel(n_jobs=self.config["n_jobs"])( |
167 | 167 | delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box, original_image_shape) in |
168 | 168 | gost_analyzed_images.items() |
169 | 169 | ) |
170 | 170 | return result, gost_analyzed_images |
171 | 171 |
|
172 | 172 | def _shift_all_contents(self, lines: List[LineWithMeta], unref_tables: List[ScanTable], attachments: List[PdfImageAttachment], |
173 | | - gost_analyzed_images: Dict[int, Tuple[np.ndarray, BBox, Tuple[int, ...]]]) -> None: |
| 173 | + gost_analyzed_images: Dict[int, Tuple[ndarray, BBox, Tuple[int, ...]]]) -> None: |
174 | 174 | # shift unref_tables |
175 | 175 | for scan_table in unref_tables: |
176 | 176 | for location in scan_table.locations: |
|
0 commit comments