Skip to content

Commit 984cb13

Browse files
authored
fix: guess HTML content starting with script tag (#1673)
Signed-off-by: Cesar Berrospi Ramis <[email protected]>
1 parent 3942923 commit 984cb13

File tree

2 files changed, with 12 additions (+12) and 1 deletion (-1).

docling/datamodel/document.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -412,7 +412,11 @@ def _detect_html_xhtml(
412 412   else:
413 413   return "application/xml"
414 414
415     - if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
    415 + if re.match(
    416 + r"(<script.*?>.*?</script>\s*)?(<!doctype\s+html|<html|<head|<body)",
    417 + content_str,
    418 + re.DOTALL,
    419 + ):
416 420   return "text/html"
417 421
418 422   p = re.compile(

tests/test_input_doc.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -132,6 +132,13 @@ def test_guess_format(tmp_path):
132 132   doc_path = Path("./tests/data/html/wiki_duck.html")
133 133   assert dci._guess_format(doc_path) == InputFormat.HTML
134 134
    135 + html_str = ( # HTML starting with a script
    136 + "<script>\nconsole.log('foo');\n</script>"
    137 + '<!doctype html>\n<html lang="en-us class="no-js"></html>'
    138 + )
    139 + stream = DocumentStream(name="lorem_ipsum", stream=BytesIO(f"{html_str}".encode()))
    140 + assert dci._guess_format(stream) == InputFormat.HTML
    141 +
135 142   # Valid MD
136 143   buf = BytesIO(Path("./tests/data/md/wiki.md").open("rb").read())
137 144   stream = DocumentStream(name="wiki.md", stream=buf)

0 commit comments

Comments
 (0)