docling-project
diff --git a/‎.github/codecov.yml
Lines changed: 17 additions & 0 deletions b/‎.github/codecov.yml
Lines changed: 17 additions & 0 deletions
diff --git a/‎.github/workflows/cd.yml
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/cd.yml
Lines changed: 2 additions & 0 deletions
diff --git a/‎.github/workflows/checks.yml
Lines changed: 15 additions & 1 deletion b/‎.github/workflows/checks.yml
Lines changed: 15 additions & 1 deletion
diff --git a/‎.github/workflows/ci.yml
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/ci.yml
Lines changed: 2 additions & 0 deletions
diff --git a/‎.pre-commit-config.yaml
Lines changed: 13 additions & 30 deletions b/‎.pre-commit-config.yaml
Lines changed: 13 additions & 30 deletions
diff --git a/‎docling/backend/asciidoc_backend.py
Lines changed: 7 additions & 15 deletions b/‎docling/backend/asciidoc_backend.py
Lines changed: 7 additions & 15 deletions
diff --git a/‎docling/backend/csv_backend.py
Lines changed: 1 addition & 1 deletion b/‎docling/backend/csv_backend.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎docling/backend/docling_parse_backend.py
Lines changed: 2 additions & 2 deletions b/‎docling/backend/docling_parse_backend.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎docling/backend/docling_parse_v2_backend.py
Lines changed: 2 additions & 2 deletions b/‎docling/backend/docling_parse_v2_backend.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎docling/backend/docling_parse_v4_backend.py
Lines changed: 3 additions & 4 deletions b/‎docling/backend/docling_parse_v4_backend.py
Lines changed: 3 additions & 4 deletions
diff --git a/‎docling/backend/docx/latex/latex_dict.py
Lines changed: 0 additions & 5 deletions b/‎docling/backend/docx/latex/latex_dict.py
Lines changed: 0 additions & 5 deletions
diff --git a/‎docling/backend/docx/latex/omml.py
Lines changed: 4 additions & 7 deletions b/‎docling/backend/docx/latex/omml.py
Lines changed: 4 additions & 7 deletions
@@ -0,0 +1,17 @@
+codecov:
+  # https://docs.codecov.io/docs/comparing-commits
+  allow_coverage_offsets: true
+coverage:
+  status:
+    project:
+      default:
+        informational: true
+        target: auto  # auto compares coverage to the previous base commit
+        flags:
+          - docling
+  comment:
+    layout: "reach, diff, flags, files"
+    behavior: default
+    require_changes: false  # if true: only post the comment if coverage changes
+    branches:               # branch names that can post comment
+      - "main"
@@ -10,6 +10,8 @@ env:
 jobs:
   code-checks:
     uses: ./.github/workflows/checks.yml
+    with:
+      push_coverage: false
   pre-release-check:
     runs-on: ubuntu-latest
     outputs:
 
@@ -1,5 +1,13 @@
 on:
   workflow_call:
+    inputs:
+      push_coverage:
+        type: boolean
+        description: "If true, the coverage results are pushed to codecov.io."
+        default: true
+    secrets:
+      CODECOV_TOKEN:
+        required: false
 
 env:
   HF_HUB_DOWNLOAD_TIMEOUT: "60"
@@ -32,7 +40,13 @@ jobs:
         run: poetry install --all-extras
       - name: Testing
         run: |
-          poetry run pytest -v tests
+          poetry run pytest -v --cov=docling --cov-report=xml tests
+      - name: Upload coverage to Codecov
+        if: inputs.push_coverage
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          files: ./coverage.xml
       - name: Run examples
         run: |
           for file in docs/examples/*.py; do
 
@@ -17,3 +17,5 @@ jobs:
   code-checks:
     if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'docling-project/docling' && github.event.pull_request.head.repo.full_name != 'docling-project/docling') }}
     uses: ./.github/workflows/checks.yml
+    secrets:
+      CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
@@ -1,43 +1,26 @@
 fail_fast: true
 repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.11.5
+    hooks:
+      # Run the Ruff formatter.
+      - id: ruff-format
+        name: "Ruff formatter"
+        args: [--config=pyproject.toml]
+        files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
+      # Run the Ruff linter.
+      - id: ruff
+        name: "Ruff linter"
+        args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
+        files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
   - repo: local
     hooks:
-      - id: black
-        name: Black
-        entry: poetry run black docling docs/examples tests
-        pass_filenames: false
-        language: system
-        files: '\.py$'
-      - id: isort
-        name: isort
-        entry: poetry run isort docling docs/examples tests
-        pass_filenames: false
-        language: system
-        files: '\.py$'
-#      - id: flake8
-#        name: flake8
-#        entry: poetry run flake8 docling
-#        pass_filenames: false
-#        language: system
-#        files: '\.py$'
       - id: mypy
         name: MyPy
         entry: poetry run mypy docling
         pass_filenames: false
         language: system
         files: '\.py$'
-      - id: nbqa_black
-        name: nbQA Black
-        entry: poetry run nbqa black docs/examples
-        pass_filenames: false
-        language: system
-        files: '\.ipynb$'
-      - id: nbqa_isort
-        name: nbQA isort
-        entry: poetry run nbqa isort docs/examples
-        pass_filenames: false
-        language: system
-        files: '\.ipynb$'
       - id: poetry
         name: Poetry check
         entry: poetry check --lock
 
@@ -34,7 +34,7 @@ def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
                 text_stream = self.path_or_stream.getvalue().decode("utf-8")
                 self.lines = text_stream.split("\n")
             if isinstance(self.path_or_stream, Path):
-                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                with open(self.path_or_stream, encoding="utf-8") as f:
                     self.lines = f.readlines()
             self.valid = True
 
@@ -75,14 +75,12 @@ def convert(self) -> DoclingDocument:
 
         return doc
 
-    def _parse(self, doc: DoclingDocument):
+    def _parse(self, doc: DoclingDocument):  # noqa: C901
         """
         Main function that orchestrates the parsing by yielding components:
         title, section headers, text, lists, and tables.
         """
 
-        content = ""
-
         in_list = False
         in_table = False
 
@@ -95,7 +93,7 @@ def _parse(self, doc: DoclingDocument):
         # indents: dict[int, Union[DocItem, GroupItem, None]] = {}
         indents: dict[int, Union[GroupItem, None]] = {}
 
-        for i in range(0, 10):
+        for i in range(10):
             parents[i] = None
             indents[i] = None
 
@@ -125,7 +123,6 @@ def _parse(self, doc: DoclingDocument):
 
             # Lists
             elif self._is_list_item(line):
-
                 _log.debug(f"line: {line}")
                 item = self._parse_list_item(line)
                 _log.debug(f"parsed list-item: {item}")
@@ -147,7 +144,6 @@ def _parse(self, doc: DoclingDocument):
                     indents[level + 1] = item["indent"]
 
                 elif in_list and item["indent"] < indents[level]:
-
                     # print(item["indent"], " => ", indents[level])
                     while item["indent"] < indents[level]:
                         # print(item["indent"], " => ", indents[level])
@@ -176,7 +172,6 @@ def _parse(self, doc: DoclingDocument):
             elif in_table and (
                 (not self._is_table_line(line)) or line.strip() == "|==="
             ):  # end of table
-
                 caption = None
                 if len(caption_data) > 0:
                     caption = doc.add_text(
@@ -195,7 +190,6 @@ def _parse(self, doc: DoclingDocument):
 
             # Picture
             elif self._is_picture(line):
-
                 caption = None
                 if len(caption_data) > 0:
                     caption = doc.add_text(
@@ -250,7 +244,6 @@ def _parse(self, doc: DoclingDocument):
                 text_data = []
 
             elif len(line.strip()) > 0:  # allow multiline texts
-
                 item = self._parse_text(line)
                 text_data.append(item["text"])
 
@@ -273,14 +266,14 @@ def _parse(self, doc: DoclingDocument):
 
     def _get_current_level(self, parents):
         for k, v in parents.items():
-            if v == None and k > 0:
+            if v is None and k > 0:
                 return k - 1
 
         return 0
 
     def _get_current_parent(self, parents):
         for k, v in parents.items():
-            if v == None and k > 0:
+            if v is None and k > 0:
                 return parents[k - 1]
 
         return None
@@ -328,15 +321,15 @@ def _parse_list_item(self, line):
                     "marker": marker,
                     "text": text.strip(),
                     "numbered": False,
-                    "indent": 0 if indent == None else len(indent),
+                    "indent": 0 if indent is None else len(indent),
                 }
             else:
                 return {
                     "type": "list_item",
                     "marker": marker,
                     "text": text.strip(),
                     "numbered": True,
-                    "indent": 0 if indent == None else len(indent),
+                    "indent": 0 if indent is None else len(indent),
                 }
         else:
             # Fallback if no match
@@ -357,7 +350,6 @@ def _parse_table_line(self, line):
         return [cell.strip() for cell in line.split("|") if cell.strip()]
 
     def _populate_table_as_grid(self, table_data):
-
         num_rows = len(table_data)
 
         # Adjust the table data into a grid format
 
@@ -58,7 +58,7 @@ def convert(self) -> DoclingDocument:
         head = self.content.readline()
         dialect = csv.Sniffer().sniff(head, ",;\t|:")
         _log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
-        if not dialect.delimiter in {",", ";", "\t", "|", ":"}:
+        if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
             raise RuntimeError(
                 f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
             )
 
@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, List, Optional, Union
+from typing import List, Optional, Union
 
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
@@ -156,7 +157,6 @@ def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
-
         page_size = self.get_size()
 
         if not cropbox:
 
@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union
 
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -172,7 +173,6 @@ def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
-
         page_size = self.get_size()
 
         if not cropbox:
 
@@ -1,14 +1,14 @@
 import logging
-import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, Optional, Union
 
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
-from PIL import Image, ImageDraw
+from PIL import Image
 from pypdfium2 import PdfPage
 
 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
@@ -93,7 +93,6 @@ def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
-
         page_size = self.get_size()
 
         if not cropbox:
 
@@ -1,12 +1,8 @@
-# -*- coding: utf-8 -*-
-
 """
 Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
 On 23/01/2025
 """
 
-from __future__ import unicode_literals
-
 CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")
 
 BLANK = ""
@@ -79,7 +75,6 @@
 }
 
 T = {
-    "\u2192": "\\rightarrow ",
     # Greek letters
     "\U0001d6fc": "\\alpha ",
     "\U0001d6fd": "\\beta ",
 
@@ -76,8 +76,7 @@ def get_val(key, default=None, store=CHR):
         return default
 
 
-class Tag2Method(object):
-
+class Tag2Method:
     def call_method(self, elm, stag=None):
         getmethod = self.tag2meth.get
         if stag is None:
@@ -130,7 +129,6 @@ def process_unknow(self, elm, stag):
 
 
 class Pr(Tag2Method):
-
     text = ""
 
     __val_tags = ("chr", "pos", "begChr", "endChr", "type")
@@ -159,7 +157,7 @@ def do_brk(self, elm):
     def do_common(self, elm):
         stag = elm.tag.replace(OMML_NS, "")
         if stag in self.__val_tags:
-            t = elm.get("{0}val".format(OMML_NS))
+            t = elm.get(f"{OMML_NS}val")
             self.__innerdict[stag] = t
         return None
 
@@ -248,7 +246,6 @@ def do_spre(self, elm):
         """
         The Pre-Sub-Superscript object -- not supported yet
         """
-        pass
 
     def do_sub(self, elm):
         text = self.process_children(elm)
@@ -331,7 +328,7 @@ def do_limlow(self, elm):
         t_dict = self.process_children_dict(elm, include=("e", "lim"))
         latex_s = LIM_FUNC.get(t_dict["e"])
         if not latex_s:
-            raise NotSupport("Not support lim %s" % t_dict["e"])
+            raise RuntimeError("Not support lim {}".format(t_dict["e"]))
         else:
             return latex_s.format(lim=t_dict.get("lim"))
 
@@ -413,7 +410,7 @@ def do_r(self, elm):
         """
         _str = []
         _base_str = []
-        found_text = elm.findtext("./{0}t".format(OMML_NS))
+        found_text = elm.findtext(f"./{OMML_NS}t")
         if found_text:
             for s in found_text:
                 out_latex_str = self.process_unicode(s)
Original file line number	Diff line number	Diff line change
`@@ -58,7 +58,7 @@ def convert(self) -> DoclingDocument:`
`58`	`58`	`head = self.content.readline()`
`59`	`59`	`dialect = csv.Sniffer().sniff(head, ",;\t|:")`
`60`	`60`	`_log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')`
`61`		`- if not dialect.delimiter in {",", ";", "\t", "|", ":"}:`
	`61`	`+ if dialect.delimiter not in {",", ";", "\t", "|", ":"}:`
`62`	`62`	`raise RuntimeError(`
`63`	`63`	`f"Cannot convert csv with unknown delimiter {dialect.delimiter}."`
`64`	`64`	`)`