From ea1a3dfb60fde71c4b2fb0a6cfa1f46e9e741f7f Mon Sep 17 00:00:00 2001
From: safen0s <99965118+safen0s@users.noreply.github.com>
Date: Tue, 26 Aug 2025 22:34:43 +0100
Subject: [PATCH] Add HTML support to DocumentIntelligenceConverter (#1352)

---
Reviewer notes (text in this region is ignored when the patch is applied):
  * _get_mime_type_prefixes: DocumentIntelligenceFileType.HTML now contributes
    the prefixes "text/html" and "application/xhtml+xml".
  * _get_file_extensions: DocumentIntelligenceFileType.HTML now contributes
    the extension ".html".
  * tests/test_docintel_html.py: builds the converter with __new__ (so
    __init__ is not run), sets _file_types directly, and asserts accepts()
    is truthy for the ".html" extension and for both MIME types.

 .../converters/_doc_intel_converter.py        |  5 ++++
 .../markitdown/tests/test_docintel_html.py    | 26 +++++++++++++++++++
 2 files changed, 31 insertions(+)
 create mode 100644 packages/markitdown/tests/test_docintel_html.py

diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
index ba66b5b..fd843f2 100644
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -84,6 +84,9 @@ def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[s
             prefixes.append(
                 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
             )
+        elif type_ == DocumentIntelligenceFileType.HTML:
+            prefixes.append("text/html")
+            prefixes.append("application/xhtml+xml")
         elif type_ == DocumentIntelligenceFileType.PDF:
             prefixes.append("application/pdf")
             prefixes.append("application/x-pdf")
@@ -119,6 +122,8 @@ def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]
             extensions.append(".bmp")
         elif type_ == DocumentIntelligenceFileType.TIFF:
             extensions.append(".tiff")
+        elif type_ == DocumentIntelligenceFileType.HTML:
+            extensions.append(".html")
     return extensions
 
 
diff --git a/packages/markitdown/tests/test_docintel_html.py b/packages/markitdown/tests/test_docintel_html.py
new file mode 100644
index 0000000..d0b4caa
--- /dev/null
+++ b/packages/markitdown/tests/test_docintel_html.py
@@ -0,0 +1,26 @@
+import io
+from markitdown.converters._doc_intel_converter import (
+    DocumentIntelligenceConverter,
+    DocumentIntelligenceFileType,
+)
+from markitdown._stream_info import StreamInfo
+
+
+def _make_converter(file_types):
+    conv = DocumentIntelligenceConverter.__new__(DocumentIntelligenceConverter)
+    conv._file_types = file_types
+    return conv
+
+
+def test_docintel_accepts_html_extension():
+    conv = _make_converter([DocumentIntelligenceFileType.HTML])
+    stream_info = StreamInfo(mimetype=None, extension=".html")
+    assert conv.accepts(io.BytesIO(b""), stream_info)
+
+
+def test_docintel_accepts_html_mimetype():
+    conv = _make_converter([DocumentIntelligenceFileType.HTML])
+    stream_info = StreamInfo(mimetype="text/html", extension=None)
+    assert conv.accepts(io.BytesIO(b""), stream_info)
+    stream_info = StreamInfo(mimetype="application/xhtml+xml", extension=None)
+    assert conv.accepts(io.BytesIO(b""), stream_info)