Add HTML support to DocumentIntelligenceConverter (#1352)
This commit is contained in:
@@ -84,6 +84,9 @@ def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[s
|
|||||||
prefixes.append(
|
prefixes.append(
|
||||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||||
)
|
)
|
||||||
|
elif type_ == DocumentIntelligenceFileType.HTML:
|
||||||
|
prefixes.append("text/html")
|
||||||
|
prefixes.append("application/xhtml+xml")
|
||||||
elif type_ == DocumentIntelligenceFileType.PDF:
|
elif type_ == DocumentIntelligenceFileType.PDF:
|
||||||
prefixes.append("application/pdf")
|
prefixes.append("application/pdf")
|
||||||
prefixes.append("application/x-pdf")
|
prefixes.append("application/x-pdf")
|
||||||
@@ -119,6 +122,8 @@ def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]
|
|||||||
extensions.append(".bmp")
|
extensions.append(".bmp")
|
||||||
elif type_ == DocumentIntelligenceFileType.TIFF:
|
elif type_ == DocumentIntelligenceFileType.TIFF:
|
||||||
extensions.append(".tiff")
|
extensions.append(".tiff")
|
||||||
|
elif type_ == DocumentIntelligenceFileType.HTML:
|
||||||
|
extensions.append(".html")
|
||||||
return extensions
|
return extensions
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
26
packages/markitdown/tests/test_docintel_html.py
Normal file
26
packages/markitdown/tests/test_docintel_html.py
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
import io
|
||||||
|
from markitdown.converters._doc_intel_converter import (
|
||||||
|
DocumentIntelligenceConverter,
|
||||||
|
DocumentIntelligenceFileType,
|
||||||
|
)
|
||||||
|
from markitdown._stream_info import StreamInfo
|
||||||
|
|
||||||
|
|
||||||
|
def _make_converter(file_types):
    """Build a DocumentIntelligenceConverter without running ``__init__``.

    The instance is allocated directly via ``__new__`` and only its
    ``_file_types`` attribute is populated with *file_types*; no other
    attribute is set on the returned object.
    """
    converter = DocumentIntelligenceConverter.__new__(DocumentIntelligenceConverter)
    converter._file_types = file_types
    return converter
|
||||||
|
|
||||||
|
|
||||||
|
def test_docintel_accepts_html_extension():
    """An HTML-only converter accepts a stream described solely by ``.html``."""
    html_only = _make_converter([DocumentIntelligenceFileType.HTML])
    # No MIME type is supplied, so only the extension identifies the stream.
    info = StreamInfo(mimetype=None, extension=".html")
    empty_stream = io.BytesIO(b"")
    assert html_only.accepts(empty_stream, info)
|
||||||
|
|
||||||
|
|
||||||
|
def test_docintel_accepts_html_mimetype():
    """An HTML-only converter accepts streams described solely by MIME type.

    Both ``text/html`` and ``application/xhtml+xml`` are checked, each with
    no file extension supplied.
    """
    html_only = _make_converter([DocumentIntelligenceFileType.HTML])
    for mimetype in ("text/html", "application/xhtml+xml"):
        info = StreamInfo(mimetype=mimetype, extension=None)
        assert html_only.accepts(io.BytesIO(b""), info)
|
||||||
Reference in New Issue
Block a user