diff --git a/.gitattributes b/.gitattributes index d2f31ef..f787c0e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,2 @@ -tests/test_files/** linguist-vendored +packages/markitdown/tests/test_files/** linguist-vendored +packages/markitdown-sample-plugin/tests/test_files/** linguist-vendored diff --git a/README.md b/README.md index 2563a68..5f9ef70 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,11 @@ > [!IMPORTANT] > Breaking changes between 0.0.1 to 0.0.2: > * Dependencies are now organized into optional feature-groups (further details below). Use `pip install markitdown[all]` to have backward-compatible behavior. +> * The DocumentConverter class interface has changed to read from file-like streams rather than file paths. *No temporary files are created anymore*. If you are the maintainer of a plugin, or custom DocumentConverter, you likely need to update your code. Otherwise, if only using the MarkItDown class or CLI (as in these examples), you should not need to change anything. -MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc). -It supports: +MarkItDown is a lightweight Python utility for converting various files to Markdown for use with LLMs and related text analysis pipelines. To this end, it is most comparable to [textract](https://github.com/deanmalmgren/textract), but with a focus on preserving important document structure and content as Markdown (including: headings, lists, tables, links, etc.) While the output is often reasonably presentable and human-friendly, it is meant to be consumed by text analysis tools -- and may not be the best option for high-fidelity document conversions for human consumption. + +At present, MarkItDown supports: - PDF - PowerPoint @@ -23,6 +25,17 @@ It supports: - Youtube URLs - ... and more! +## Why Markdown? + +Markdown is extremely close to plain text, with minimal markup or formatting, but still +provides a way to represent important document structure. 
Mainstream LLMs, such as +OpenAI's GPT-4o, natively "_speak_" Markdown, and often incorporate Markdown into their +responses unprompted. This suggests that they have been trained on vast amounts of +Markdown-formatted text, and understand it well. As a side benefit, Markdown conventions +are also highly token-efficient. + +## Installation + To install MarkItDown, use pip: `pip install markitdown[all]`. Alternatively, you can install it from the source: ```bash diff --git a/packages/markitdown-sample-plugin/README.md b/packages/markitdown-sample-plugin/README.md index 06324cd..fd7115f 100644 --- a/packages/markitdown-sample-plugin/README.md +++ b/packages/markitdown-sample-plugin/README.md @@ -10,23 +10,38 @@ This project shows how to create a sample plugin for MarkItDown. The most import Next, implement your custom DocumentConverter: ```python -from typing import Union -from markitdown import DocumentConverter, DocumentConverterResult +from typing import BinaryIO, Any +from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult, StreamInfo class RtfConverter(DocumentConverter): - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not an RTF file - extension = kwargs.get("file_extension", "") - if extension.lower() != ".rtf": - return None - # Implement the conversion logic here ... + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) - # Return the result - return DocumentConverterResult( - title=title, - text_content=text_content, - ) + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + + # Implement logic to check if the file stream is an RTF file + # ... 
+ raise NotImplementedError() + + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + + # Implement logic to convert the file stream to Markdown + # ... + raise NotImplementedError() ``` Next, make sure your package implements and exports the following: @@ -71,10 +86,10 @@ Once the plugin package is installed, verify that it is available to MarkItDown markitdown --list-plugins ``` -To use the plugin for a conversion use the `--use-plugins` flag. For example, to convert a PDF: +To use the plugin for a conversion use the `--use-plugins` flag. For example, to convert an RTF file: ```bash -markitdown --use-plugins path-to-file.pdf +markitdown --use-plugins path-to-file.rtf ``` In Python, plugins can be enabled as follows: @@ -83,7 +98,7 @@ In Python, plugins can be enabled as follows: from markitdown import MarkItDown md = MarkItDown(enable_plugins=True) -result = md.convert("path-to-file.pdf") +result = md.convert("path-to-file.rtf") print(result.text_content) ``` diff --git a/packages/markitdown-sample-plugin/pyproject.toml b/packages/markitdown-sample-plugin/pyproject.toml index aaf2012..d8668aa 100644 --- a/packages/markitdown-sample-plugin/pyproject.toml +++ b/packages/markitdown-sample-plugin/pyproject.toml @@ -24,7 +24,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "markitdown", + "markitdown>=0.0.2a2", "striprtf", ] diff --git a/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/__about__.py b/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/__about__.py index fa67ccb..a365900 100644 --- a/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/__about__.py +++ b/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT -__version__ = "0.0.1a2" +__version__ = "0.0.1a3" diff --git 
a/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/_plugin.py b/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/_plugin.py index 98e660e..1362818 100644 --- a/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/_plugin.py +++ b/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/_plugin.py @@ -1,12 +1,26 @@ -from typing import Union +import locale +from typing import BinaryIO, Any from striprtf.striprtf import rtf_to_text -from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult +from markitdown import ( + MarkItDown, + DocumentConverter, + DocumentConverterResult, + StreamInfo, +) + __plugin_interface_version__ = ( 1 # The version of the plugin interface that this plugin uses ) +ACCEPTED_MIME_TYPE_PREFIXES = [ + "text/rtf", + "application/rtf", +] + +ACCEPTED_FILE_EXTENSIONS = [".rtf"] + def register_converters(markitdown: MarkItDown, **kwargs): """ @@ -22,18 +36,41 @@ class RtfConverter(DocumentConverter): Converts an RTF file to in the simplest possible way. 
""" - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a RTF - extension = kwargs.get("file_extension", "") - if extension.lower() != ".rtf": - return None + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) - # Read the RTF file - with open(local_path, "r") as f: - rtf = f.read() + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + # Read the file stream into an str using hte provided charset encoding, or using the system default + encoding = stream_info.charset or locale.getpreferredencoding() + stream_data = file_stream.read().decode(encoding) # Return the result return DocumentConverterResult( title=None, - text_content=rtf_to_text(rtf), + markdown=rtf_to_text(stream_data), ) diff --git a/packages/markitdown-sample-plugin/tests/test_sample_plugin.py b/packages/markitdown-sample-plugin/tests/test_sample_plugin.py index 49d54aa..6d0102d 100644 --- a/packages/markitdown-sample-plugin/tests/test_sample_plugin.py +++ b/packages/markitdown-sample-plugin/tests/test_sample_plugin.py @@ -2,7 +2,7 @@ import os import pytest -from markitdown import MarkItDown +from markitdown import MarkItDown, StreamInfo from markitdown_sample_plugin import RtfConverter TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") @@ -15,18 +15,22 @@ RTF_TEST_STRINGS = { def test_converter() -> None: """Tests the RTF converter dirctly.""" - converter = RtfConverter() - result = converter.convert( - 
os.path.join(TEST_FILES_DIR, "test.rtf"), file_extension=".rtf" - ) + with open(os.path.join(TEST_FILES_DIR, "test.rtf"), "rb") as file_stream: + converter = RtfConverter() + result = converter.convert( + file_stream=file_stream, + stream_info=StreamInfo( + mimetype="text/rtf", extension=".rtf", filename="test.rtf" + ), + ) - for test_string in RTF_TEST_STRINGS: - assert test_string in result.text_content + for test_string in RTF_TEST_STRINGS: + assert test_string in result.text_content def test_markitdown() -> None: """Tests that MarkItDown correctly loads the plugin.""" - md = MarkItDown() + md = MarkItDown(enable_plugins=True) result = md.convert(os.path.join(TEST_FILES_DIR, "test.rtf")) for test_string in RTF_TEST_STRINGS: diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index c053c7b..d0f515e 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -26,7 +26,7 @@ classifiers = [ dependencies = [ "beautifulsoup4", "requests", - "markdownify~=0.14.1", + "markdownify", "puremagic", "pathvalidate", "charset-normalizer", @@ -78,11 +78,14 @@ extra-dependencies = [ ] [tool.hatch.envs.types] +features = ["all"] extra-dependencies = [ + "openai", "mypy>=1.0.0", ] + [tool.hatch.envs.types.scripts] -check = "mypy --install-types --non-interactive {args:src/markitdown tests}" +check = "mypy --install-types --non-interactive --ignore-missing-imports {args:src/markitdown tests}" [tool.coverage.run] source_pkgs = ["markitdown", "tests"] diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py index dc5aafc..4ebb498 100644 --- a/packages/markitdown/src/markitdown/__about__.py +++ b/packages/markitdown/src/markitdown/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT -__version__ = "0.0.2a1" +__version__ = "0.0.2a2" diff --git a/packages/markitdown/src/markitdown/__init__.py 
b/packages/markitdown/src/markitdown/__init__.py index 9f7db16..af356dd 100644 --- a/packages/markitdown/src/markitdown/__init__.py +++ b/packages/markitdown/src/markitdown/__init__.py @@ -3,7 +3,13 @@ # SPDX-License-Identifier: MIT from .__about__ import __version__ -from ._markitdown import MarkItDown +from ._markitdown import ( + MarkItDown, + PRIORITY_SPECIFIC_FILE_FORMAT, + PRIORITY_GENERIC_FILE_FORMAT, +) +from ._base_converter import DocumentConverterResult, DocumentConverter +from ._stream_info import StreamInfo from ._exceptions import ( MarkItDownException, MissingDependencyException, @@ -11,7 +17,6 @@ from ._exceptions import ( FileConversionException, UnsupportedFormatException, ) -from .converters import DocumentConverter, DocumentConverterResult __all__ = [ "__version__", @@ -23,4 +28,7 @@ __all__ = [ "FailedConversionAttempt", "FileConversionException", "UnsupportedFormatException", + "StreamInfo", + "PRIORITY_SPECIFIC_FILE_FORMAT", + "PRIORITY_GENERIC_FILE_FORMAT", ] diff --git a/packages/markitdown/src/markitdown/_base_converter.py b/packages/markitdown/src/markitdown/_base_converter.py new file mode 100644 index 0000000..2f0ca9d --- /dev/null +++ b/packages/markitdown/src/markitdown/_base_converter.py @@ -0,0 +1,108 @@ +import os +import tempfile +from warnings import warn +from typing import Any, Union, BinaryIO, Optional, List +from ._stream_info import StreamInfo + + +class DocumentConverterResult: + """The result of converting a document to Markdown.""" + + def __init__( + self, + markdown: str, + *, + title: Optional[str] = None, + ): + """ + Initialize the DocumentConverterResult. + + The only required parameter is the converted Markdown text. + The title, and any other metadata that may be added in the future, are optional. + + Parameters: + - markdown: The converted Markdown text. + - title: Optional title of the document. 
+ """ + self.markdown = markdown + self.title = title + + @property + def text_content(self) -> str: + """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__.""" + return self.markdown + + @text_content.setter + def text_content(self, markdown: str): + """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__.""" + self.markdown = markdown + + def __str__(self) -> str: + """Return the converted Markdown text.""" + return self.markdown + + +class DocumentConverter: + """Abstract superclass of all DocumentConverters.""" + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + """ + Return a quick determination on if the converter should attempt converting the document. + This is primarily based `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`). + In cases where the data is retrieved via HTTP, the `steam_info.url` might also be referenced to + make a determination (e.g., special converters for Wikipedia, YouTube etc). + Finally, it is conceivable that the `stream_info.filename` might be used to in cases + where the filename is well-known (e.g., `Dockerfile`, `Makefile`, etc) + + NOTE: The method signature is designed to match that of the convert() method. This provides some + assurance that, if accepts() returns True, the convert() method will also be able to handle the document. + + IMPORTANT: In rare cases, (e.g., OutlookMsgConverter) we need to read more from the stream to make a final + determination. Read operations inevitably advances the position in file_stream. In these case, the position + MUST be reset it MUST be reset before returning. This is because the convert() method may be called immediately + after accepts(), and will expect the file_stream to be at the original position. 
+ + E.g., + cur_pos = file_stream.tell() # Save the current position + data = file_stream.read(100) # ... peek at the first 100 bytes, etc. + file_stream.seek(cur_pos) # Reset the position to the original position + + Parameters: + - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods. + - stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, etc.) + - kwargs: Additional keyword arguments for the converter. + + Returns: + - bool: True if the converter can handle the document, False otherwise. + """ + raise NotImplementedError( + f"The subclass, {type(self).__name__}, must implement the accepts() method to determine if they can handle the document." + ) + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + """ + Convert a document to Markdown text. + + Parameters: + - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods. + - stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, etc.) + - kwargs: Additional keyword arguments for the converter. + + Returns: + - DocumentConverterResult: The result of the conversion, which includes the title and markdown content. + + Raises: + - FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason. + - MissingDependencyException: If the converter requires a dependency that is not installed.
+ """ + raise NotImplementedError("Subclasses must implement this method") diff --git a/packages/markitdown/src/markitdown/_exceptions.py b/packages/markitdown/src/markitdown/_exceptions.py index abfebc6..93f8f0e 100644 --- a/packages/markitdown/src/markitdown/_exceptions.py +++ b/packages/markitdown/src/markitdown/_exceptions.py @@ -68,6 +68,9 @@ class FileConversionException(MarkItDownException): else: message = f"File conversion failed after {len(attempts)} attempts:\n" for attempt in attempts: - message += f" - {type(attempt.converter).__name__} threw {attempt.exc_info[0].__name__} with message: {attempt.exc_info[1]}\n" + if attempt.exc_info is None: + message += " - {type(attempt.converter).__name__} provided no execution info." + else: + message += f" - {type(attempt.converter).__name__} threw {attempt.exc_info[0].__name__} with message: {attempt.exc_info[1]}\n" super().__init__(message) diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 8f1bd46..6086eb9 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -6,8 +6,10 @@ import sys import tempfile import warnings import traceback +import io +from dataclasses import dataclass from importlib.metadata import entry_points -from typing import Any, List, Optional, Union +from typing import Any, List, Optional, Union, BinaryIO from pathlib import Path from urllib.parse import urlparse from warnings import warn @@ -16,9 +18,9 @@ from warnings import warn import puremagic import requests +from ._stream_info import StreamInfo, _guess_stream_info_from_stream + from .converters import ( - DocumentConverter, - DocumentConverterResult, PlainTextConverter, HtmlConverter, RssConverter, @@ -32,26 +34,34 @@ from .converters import ( XlsConverter, PptxConverter, ImageConverter, - WavConverter, - Mp3Converter, + AudioConverter, OutlookMsgConverter, ZipConverter, DocumentIntelligenceConverter, 
) +from ._base_converter import DocumentConverter, DocumentConverterResult + from ._exceptions import ( FileConversionException, UnsupportedFormatException, FailedConversionAttempt, ) -# Override mimetype for csv to fix issue on windows -mimetypes.add_type("text/csv", ".csv") -_plugins: Union[None | List[Any]] = None +# Lower priority values are tried first. +PRIORITY_SPECIFIC_FILE_FORMAT = ( + 0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia +) +PRIORITY_GENERIC_FILE_FORMAT = ( + 10.0 # Near catch-all converters for mimetypes like text/*, etc. +) -def _load_plugins() -> Union[None | List[Any]]: +_plugins: List[Any] = [] + + +def _load_plugins() -> List[Any]: """Lazy load plugins, exiting early if already loaded.""" global _plugins @@ -71,6 +81,14 @@ def _load_plugins() -> Union[None | List[Any]]: return _plugins +@dataclass(kw_only=True, frozen=True) +class ConverterRegistration: + """A registration of a converter with its priority and other metadata.""" + + converter: DocumentConverter + priority: float + + class MarkItDown: """(In preview) An extremely simple text-based document reader, suitable for LLM use. 
This reader will convert common file-types or webpages to Markdown.""" @@ -92,13 +110,13 @@ class MarkItDown: self._requests_session = requests_session # TODO - remove these (see enable_builtins) - self._llm_client = None - self._llm_model = None - self._exiftool_path = None - self._style_map = None + self._llm_client: Any = None + self._llm_model: Union[str | None] = None + self._exiftool_path: Union[str | None] = None + self._style_map: Union[str | None] = None # Register the converters - self._page_converters: List[DocumentConverter] = [] + self._converters: List[ConverterRegistration] = [] if ( enable_builtins is None or enable_builtins @@ -126,9 +144,15 @@ class MarkItDown: # Register converters for successful browsing operations # Later registrations are tried first / take higher priority than earlier registrations # To this end, the most specific converters should appear below the most generic converters - self.register_converter(PlainTextConverter()) - self.register_converter(ZipConverter()) - self.register_converter(HtmlConverter()) + self.register_converter( + PlainTextConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT + ) + self.register_converter( + ZipConverter(markitdown=self), priority=PRIORITY_GENERIC_FILE_FORMAT + ) + self.register_converter( + HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT + ) self.register_converter(RssConverter()) self.register_converter(WikipediaConverter()) self.register_converter(YouTubeConverter()) @@ -137,8 +161,7 @@ class MarkItDown: self.register_converter(XlsxConverter()) self.register_converter(XlsConverter()) self.register_converter(PptxConverter()) - self.register_converter(WavConverter()) - self.register_converter(Mp3Converter()) + self.register_converter(AudioConverter()) self.register_converter(ImageConverter()) self.register_converter(IpynbConverter()) self.register_converter(PdfConverter()) @@ -174,12 +197,17 @@ class MarkItDown: warn("Plugins converters are already enabled.", RuntimeWarning) def convert( 
- self, source: Union[str, requests.Response, Path], **kwargs: Any + self, + source: Union[str, requests.Response, Path, BinaryIO], + *, + stream_info: Optional[StreamInfo] = None, + **kwargs: Any, ) -> DocumentConverterResult: # TODO: deal with kwargs """ Args: - - source: can be a string representing a path either as string pathlib path object or url, or a requests.response object - - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.) + - source: can be a path (str or Path), url, or a requests.response object + - stream_info: optional stream info to use for the conversion. If None, infer from source + - kwargs: additional arguments to pass to the converter """ # Local path or url @@ -191,68 +219,120 @@ class MarkItDown: ): return self.convert_url(source, **kwargs) else: - return self.convert_local(source, **kwargs) + return self.convert_local(source, stream_info=stream_info, **kwargs) + # Path object + elif isinstance(source, Path): + return self.convert_local(source, stream_info=stream_info, **kwargs) # Request response elif isinstance(source, requests.Response): return self.convert_response(source, **kwargs) - elif isinstance(source, Path): - return self.convert_local(source, **kwargs) + # Binary stream + elif ( + hasattr(source, "read") + and callable(source.read) + and not isinstance(source, io.TextIOBase) + ): + return self.convert_stream(source, **kwargs) + else: + raise TypeError( + f"Invalid source type: {type(source)}. Expected str, requests.Response, BinaryIO." 
+ ) def convert_local( - self, path: Union[str, Path], **kwargs: Any - ) -> DocumentConverterResult: # TODO: deal with kwargs + self, + path: Union[str, Path], + *, + stream_info: Optional[StreamInfo] = None, + file_extension: Optional[str] = None, # Deprecated -- use stream_info + url: Optional[str] = None, # Deprecated -- use stream_info + **kwargs: Any, + ) -> DocumentConverterResult: if isinstance(path, Path): path = str(path) - # Prepare a list of extensions to try (in order of priority) - ext = kwargs.get("file_extension") - extensions = [ext] if ext is not None else [] - # Get extension alternatives from the path and puremagic - base, ext = os.path.splitext(path) - self._append_ext(extensions, ext) + # Build a base StreamInfo object from which to start guesses + base_stream_info = StreamInfo( + local_path=path, + extension=os.path.splitext(path)[1], + filename=os.path.basename(path), + ) - for g in self._guess_ext_magic(path): - self._append_ext(extensions, g) + # Extend the base_stream_info with any additional info from the arguments + if stream_info is not None: + base_stream_info = base_stream_info.copy_and_update(stream_info) - # Convert - return self._convert(path, extensions, **kwargs) + if file_extension is not None: + # Deprecated -- use stream_info + base_stream_info = base_stream_info.copy_and_update( + extension=file_extension + ) + + if url is not None: + # Deprecated -- use stream_info + base_stream_info = base_stream_info.copy_and_update(url=url) + + with open(path, "rb") as fh: + # Prepare a list of configurations to try, starting with the base_stream_info + guesses: List[StreamInfo] = [base_stream_info] + for guess in _guess_stream_info_from_stream( + file_stream=fh, filename_hint=path + ): + guesses.append(base_stream_info.copy_and_update(guess)) + return self._convert(file_stream=fh, stream_info_guesses=guesses, **kwargs) - # TODO what should stream's type be? 
def convert_stream( - self, stream: Any, **kwargs: Any - ) -> DocumentConverterResult: # TODO: deal with kwargs - # Prepare a list of extensions to try (in order of priority) - ext = kwargs.get("file_extension") - extensions = [ext] if ext is not None else [] + self, + stream: BinaryIO, + *, + stream_info: Optional[StreamInfo] = None, + file_extension: Optional[str] = None, # Deprecated -- use stream_info + url: Optional[str] = None, # Deprecated -- use stream_info + **kwargs: Any, + ) -> DocumentConverterResult: + guesses: List[StreamInfo] = [] - # Save the file locally to a temporary file. It will be deleted before this method exits - handle, temp_path = tempfile.mkstemp() - fh = os.fdopen(handle, "wb") - result = None - try: - # Write to the temporary file - content = stream.read() - if isinstance(content, str): - fh.write(content.encode("utf-8")) + # Do we have anything on which to base a guess? + base_guess = None + if stream_info is not None or file_extension is not None or url is not None: + # Start with a non-Null base guess + if stream_info is None: + base_guess = StreamInfo() else: - fh.write(content) - fh.close() + base_guess = stream_info - # Use puremagic to check for more extension options - for g in self._guess_ext_magic(temp_path): - self._append_ext(extensions, g) + if file_extension is not None: + # Deprecated -- use stream_info + assert base_guess is not None # for mypy + base_guess = base_guess.copy_and_update(extension=file_extension) - # Convert - result = self._convert(temp_path, extensions, **kwargs) - # Clean up - finally: - try: - fh.close() - except Exception: - pass - os.unlink(temp_path) + if url is not None: + # Deprecated -- use stream_info + assert base_guess is not None # for mypy + base_guess = base_guess.copy_and_update(url=url) - return result + # Append the base guess, if it's non-trivial + if base_guess is not None: + if base_guess.mimetype is not None or base_guess.extension is not None: + guesses.append(base_guess) + else: + 
# Create a base guess with no information + base_guess = StreamInfo() + + # Create a placeholder filename to help with guessing + placeholder_filename = None + if base_guess.filename is not None: + placeholder_filename = base_guess.filename + elif base_guess.extension is not None: + placeholder_filename = "placeholder" + base_guess.extension + + # Add guesses based on stream content + for guess in _guess_stream_info_from_stream( + file_stream=stream, filename_hint=placeholder_filename + ): + guesses.append(base_guess.copy_and_update(guess)) + + # Perform the conversion + return self._convert(file_stream=stream, stream_info_guesses=guesses, **kwargs) def convert_url( self, url: str, **kwargs: Any @@ -263,55 +343,94 @@ class MarkItDown: return self.convert_response(response, **kwargs) def convert_response( - self, response: requests.Response, **kwargs: Any - ) -> DocumentConverterResult: # TODO fix kwargs type - # Prepare a list of extensions to try (in order of priority) - ext = kwargs.get("file_extension") - extensions = [ext] if ext is not None else [] + self, + response: requests.Response, + *, + stream_info: Optional[StreamInfo] = None, + file_extension: Optional[str] = None, # Deprecated -- use stream_info + url: Optional[str] = None, # Deprecated -- use stream_info + **kwargs: Any, + ) -> DocumentConverterResult: + # If there is a content-type header, get the mimetype and charset (if present) + mimetype: Optional[str] = None + charset: Optional[str] = None - # Guess from the mimetype - content_type = response.headers.get("content-type", "").split(";")[0] - self._append_ext(extensions, mimetypes.guess_extension(content_type)) + if "content-type" in response.headers: + parts = response.headers["content-type"].split(";") + mimetype = parts.pop(0).strip() + for part in parts: + if part.strip().startswith("charset="): + _charset = part.split("=")[1].strip() + if len(_charset) > 0: + charset = _charset - # Read the content disposition if there is one - 
content_disposition = response.headers.get("content-disposition", "") - m = re.search(r"filename=([^;]+)", content_disposition) - if m: - base, ext = os.path.splitext(m.group(1).strip("\"'")) - self._append_ext(extensions, ext) + # If there is a content-disposition header, get the filename and possibly the extension + filename: Optional[str] = None + extension: Optional[str] = None + if "content-disposition" in response.headers: + m = re.search(r"filename=([^;]+)", response.headers["content-disposition"]) + if m: + filename = m.group(1).strip("\"'") + _, _extension = os.path.splitext(filename) + if len(_extension) > 0: + extension = _extension - # Read from the extension from the path - base, ext = os.path.splitext(urlparse(response.url).path) - self._append_ext(extensions, ext) + # If there is still no filename, try to read it from the url + if filename is None: + parsed_url = urlparse(response.url) + _, _extension = os.path.splitext(parsed_url.path) + if len(_extension) > 0: # Looks like this might be a file! + filename = os.path.basename(parsed_url.path) + extension = _extension - # Save the file locally to a temporary file. 
It will be deleted before this method exits - handle, temp_path = tempfile.mkstemp() - fh = os.fdopen(handle, "wb") - result = None - try: - # Download the file - for chunk in response.iter_content(chunk_size=512): - fh.write(chunk) - fh.close() + # Create an initial guess from all this information + base_guess = StreamInfo( + mimetype=mimetype, + charset=charset, + filename=filename, + extension=extension, + url=response.url, + ) - # Use puremagic to check for more extension options - for g in self._guess_ext_magic(temp_path): - self._append_ext(extensions, g) + # Update with any additional info from the arguments + if stream_info is not None: + base_guess = base_guess.copy_and_update(stream_info) + if file_extension is not None: + # Deprecated -- use stream_info + base_guess = base_guess.copy_and_update(extension=file_extension) + if url is not None: + # Deprecated -- use stream_info + base_guess = base_guess.copy_and_update(url=url) - # Convert - result = self._convert(temp_path, extensions, url=response.url, **kwargs) - # Clean up - finally: - try: - fh.close() - except Exception: - pass - os.unlink(temp_path) + # Add the guess if its non-trivial + guesses: List[StreamInfo] = [] + if base_guess.mimetype is not None or base_guess.extension is not None: + guesses.append(base_guess) - return result + # Read into BytesIO + buffer = io.BytesIO() + for chunk in response.iter_content(chunk_size=512): + buffer.write(chunk) + buffer.seek(0) + + # Create a placeholder filename to help with guessing + placeholder_filename = None + if base_guess.filename is not None: + placeholder_filename = base_guess.filename + elif base_guess.extension is not None: + placeholder_filename = "placeholder" + base_guess.extension + + # Add guesses based on stream content + for guess in _guess_stream_info_from_stream( + file_stream=buffer, filename_hint=placeholder_filename + ): + guesses.append(base_guess.copy_and_update(guess)) + + # Convert + return self._convert(file_stream=buffer, 
stream_info_guesses=guesses, **kwargs) def _convert( - self, local_path: str, extensions: List[Union[str, None]], **kwargs + self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs ) -> DocumentConverterResult: res: Union[None, DocumentConverterResult] = None @@ -321,19 +440,21 @@ class MarkItDown: # Create a copy of the page_converters list, sorted by priority. # We do this with each call to _convert because the priority of converters may change between calls. # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order. - sorted_converters = sorted(self._page_converters, key=lambda x: x.priority) + sorted_registrations = sorted(self._converters, key=lambda x: x.priority) + + # Remember the initial stream position so that we can return to it + cur_pos = file_stream.tell() + + for stream_info in stream_info_guesses + [StreamInfo()]: + for converter_registration in sorted_registrations: + converter = converter_registration.converter + # Sanity check -- make sure the cur_pos is still the same + assert ( + cur_pos == file_stream.tell() + ), f"File stream position should NOT change between guess iterations" - for ext in extensions + [None]: # Try last with no extension - for converter in sorted_converters: _kwargs = copy.deepcopy(kwargs) - # Overwrite file_extension appropriately - if ext is None: - if "file_extension" in _kwargs: - del _kwargs["file_extension"] - else: - _kwargs.update({"file_extension": ext}) - # Copy any additional global options if "llm_client" not in _kwargs and self._llm_client is not None: _kwargs["llm_client"] = self._llm_client @@ -348,17 +469,40 @@ class MarkItDown: _kwargs["exiftool_path"] = self._exiftool_path # Add the list of converters for nested processing - _kwargs["_parent_converters"] = self._page_converters + _kwargs["_parent_converters"] = self._converters - # If we hit an error log it and keep trying + # Add legaxy kwargs + if stream_info is not None: + if 
stream_info.extension is not None: + _kwargs["file_extension"] = stream_info.extension + + if stream_info.url is not None: + _kwargs["url"] = stream_info.url + + # Check if the converter will accept the file, and if so, try to convert it + _accepts = False try: - res = converter.convert(local_path, **_kwargs) - except Exception: - failed_attempts.append( - FailedConversionAttempt( - converter=converter, exc_info=sys.exc_info() + _accepts = converter.accepts(file_stream, stream_info, **_kwargs) + except NotImplementedError: + pass + + # accept() should not have changed the file stream position + assert ( + cur_pos == file_stream.tell() + ), f"{type(converter).__name__}.accept() should NOT change the file_stream position" + + # Attempt the conversion + if _accepts: + try: + res = converter.convert(file_stream, stream_info, **_kwargs) + except Exception: + failed_attempts.append( + FailedConversionAttempt( + converter=converter, exc_info=sys.exc_info() + ) ) - ) + finally: + file_stream.seek(cur_pos) if res is not None: # Normalize the content @@ -366,8 +510,6 @@ class MarkItDown: [line.rstrip() for line in re.split(r"\r?\n", res.text_content)] ) res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content) - - # Todo return res # If we got this far without success, report any exceptions @@ -376,61 +518,9 @@ class MarkItDown: # Nothing can handle it! raise UnsupportedFormatException( - f"Could not convert '{local_path}' to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported." + f"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported." 
) - def _append_ext(self, extensions, ext): - """Append a unique non-None, non-empty extension to a list of extensions.""" - if ext is None: - return - ext = ext.strip() - if ext == "": - return - if ext in extensions: - return - extensions.append(ext) - - def _guess_ext_magic(self, path): - """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes.""" - # Use puremagic to guess - try: - guesses = puremagic.magic_file(path) - - # Fix for: https://github.com/microsoft/markitdown/issues/222 - # If there are no guesses, then try again after trimming leading ASCII whitespaces. - # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f' - # (space, tab, newline, carriage return, vertical tab, form feed). - if len(guesses) == 0: - with open(path, "rb") as file: - while True: - char = file.read(1) - if not char: # End of file - break - if not char.isspace(): - file.seek(file.tell() - 1) - break - try: - guesses = puremagic.magic_stream(file) - except puremagic.main.PureError: - pass - - extensions = list() - for g in guesses: - ext = g.extension.strip() - if len(ext) > 0: - if not ext.startswith("."): - ext = "." + ext - if ext not in extensions: - extensions.append(ext) - return extensions - except FileNotFoundError: - pass - except IsADirectoryError: - pass - except PermissionError: - pass - return [] - def register_page_converter(self, converter: DocumentConverter) -> None: """DEPRECATED: User register_converter instead.""" warn( @@ -439,6 +529,34 @@ class MarkItDown: ) self.register_converter(converter) - def register_converter(self, converter: DocumentConverter) -> None: - """Register a page text converter.""" - self._page_converters.insert(0, converter) + def register_converter( + self, + converter: DocumentConverter, + *, + priority: float = PRIORITY_SPECIFIC_FILE_FORMAT, + ) -> None: + """ + Register a DocumentConverter with a given priority. 
+ + Priorities work as follows: By default, most converters get priority + DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception + is the PlainTextConverter, HtmlConverter, and ZipConverter, which get + priority PRIORITY_SPECIFIC_FILE_FORMAT (== 10), with lower values + being tried first (i.e., higher priority). + + Just prior to conversion, the converters are sorted by priority, using + a stable sort. This means that converters with the same priority will + remain in the same order, with the most recently registered converters + appearing first. + + We have tight control over the order of built-in converters, but + plugins can register converters in any order. The registration's priority + field reasserts some control over the order of converters. + + Plugins can register converters with any priority, to appear before or + after the built-ins. For example, a plugin with priority 9 will run + before the PlainTextConverter, but after the built-in converters. + """ + self._converters.insert( + 0, ConverterRegistration(converter=converter, priority=priority) + ) diff --git a/packages/markitdown/src/markitdown/_stream_info.py b/packages/markitdown/src/markitdown/_stream_info.py new file mode 100644 index 0000000..1eaa4d2 --- /dev/null +++ b/packages/markitdown/src/markitdown/_stream_info.py @@ -0,0 +1,122 @@ +import puremagic +import mimetypes +import os +from dataclasses import dataclass, asdict +from typing import Optional, BinaryIO, List, TypeVar, Type + +# Mimetype substitutions table +MIMETYPE_SUBSTITUTIONS = { + "application/excel": "application/vnd.ms-excel", + "application/mspowerpoint": "application/vnd.ms-powerpoint", +} + + +@dataclass(kw_only=True, frozen=True) +class StreamInfo: + """The StreamInfo class is used to store information about a file stream. + All fields can be None, and will depend on how the stream was opened. 
+ """ + + mimetype: Optional[str] = None + extension: Optional[str] = None + charset: Optional[str] = None + filename: Optional[ + str + ] = None # From local path, url, or Content-Disposition header + local_path: Optional[str] = None # If read from disk + url: Optional[str] = None # If read from url + + def copy_and_update(self, *args, **kwargs): + """Copy the StreamInfo object and update it with the given StreamInfo + instance and/or other keyword arguments.""" + new_info = asdict(self) + + for si in args: + assert isinstance(si, StreamInfo) + new_info.update({k: v for k, v in asdict(si).items() if v is not None}) + + if len(kwargs) > 0: + new_info.update(kwargs) + + return StreamInfo(**new_info) + + +# Behavior subject to change. +# Do not rely on this outside of this module. +def _guess_stream_info_from_stream( + file_stream: BinaryIO, + *, + filename_hint: Optional[str] = None, +) -> List[StreamInfo]: + """ + Guess StreamInfo properties (mostly mimetype and extension) from a stream. + + Args: + - stream: The stream to guess the StreamInfo from. + - filename_hint [Optional]: A filename hint to help with the guessing (may be a placeholder, and not actually be the file name) + + Returns a list of StreamInfo objects in order of confidence. 
+ """ + guesses: List[StreamInfo] = [] + + # Add a guess purely based on the filename hint + if filename_hint: + try: + # Requires Python 3.13+ + mimetype, _ = mimetypes.guess_file_type(filename_hint) # type: ignore + except AttributeError: + mimetype, _ = mimetypes.guess_type(filename_hint) + + if mimetype: + guesses.append( + StreamInfo( + mimetype=mimetype, extension=os.path.splitext(filename_hint)[1] + ) + ) + + def _puremagic( + file_stream, filename_hint + ) -> List[puremagic.main.PureMagicWithConfidence]: + """Wrap guesses to handle exceptions.""" + try: + return puremagic.magic_stream(file_stream, filename=filename_hint) + except puremagic.main.PureError as e: + return [] + + cur_pos = file_stream.tell() + type_guesses = _puremagic(file_stream, filename_hint=filename_hint) + if len(type_guesses) == 0: + # Fix for: https://github.com/microsoft/markitdown/issues/222 + # If there are no guesses, then try again after trimming leading ASCII whitespaces. + # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f' + # (space, tab, newline, carriage return, vertical tab, form feed). + + # Eat all the leading whitespace + file_stream.seek(cur_pos) + while True: + char = file_stream.read(1) + if not char: # End of file + break + if not char.isspace(): + file_stream.seek(file_stream.tell() - 1) + break + + # Try again + type_guesses = _puremagic(file_stream, filename_hint=filename_hint) + file_stream.seek(cur_pos) + + # Convert and return the guesses + for guess in type_guesses: + kwargs: dict[str, str] = {} + if guess.extension: + kwargs["extension"] = guess.extension + if guess.mime_type: + kwargs["mimetype"] = MIMETYPE_SUBSTITUTIONS.get( + guess.mime_type, guess.mime_type + ) + if len(kwargs) > 0: + # We don't add the filename_hint, because sometimes it's just a placeholder, + # and, in any case, doesn't add new information. 
+ guesses.append(StreamInfo(**kwargs)) + + return guesses diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py index 1e5afe4..f43efe3 100644 --- a/packages/markitdown/src/markitdown/converters/__init__.py +++ b/packages/markitdown/src/markitdown/converters/__init__.py @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: MIT -from ._base import DocumentConverter, DocumentConverterResult from ._plain_text_converter import PlainTextConverter from ._html_converter import HtmlConverter from ._rss_converter import RssConverter @@ -15,15 +14,12 @@ from ._docx_converter import DocxConverter from ._xlsx_converter import XlsxConverter, XlsConverter from ._pptx_converter import PptxConverter from ._image_converter import ImageConverter -from ._wav_converter import WavConverter -from ._mp3_converter import Mp3Converter +from ._audio_converter import AudioConverter from ._outlook_msg_converter import OutlookMsgConverter from ._zip_converter import ZipConverter from ._doc_intel_converter import DocumentIntelligenceConverter __all__ = [ - "DocumentConverter", - "DocumentConverterResult", "PlainTextConverter", "HtmlConverter", "RssConverter", @@ -37,8 +33,7 @@ __all__ = [ "XlsConverter", "PptxConverter", "ImageConverter", - "WavConverter", - "Mp3Converter", + "AudioConverter", "OutlookMsgConverter", "ZipConverter", "DocumentIntelligenceConverter", diff --git a/packages/markitdown/src/markitdown/converters/_audio_converter.py b/packages/markitdown/src/markitdown/converters/_audio_converter.py new file mode 100644 index 0000000..845ad5d --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_audio_converter.py @@ -0,0 +1,102 @@ +import io +from typing import Any, BinaryIO, Optional + +from ._exiftool import exiftool_metadata +from ._transcribe_audio import transcribe_audio +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo +from .._exceptions 
import MissingDependencyException + +ACCEPTED_MIME_TYPE_PREFIXES = [ + "audio/x-wav", + "audio/mpeg", + "video/mp4", +] + +ACCEPTED_FILE_EXTENSIONS = [ + ".wav", + ".mp3", + ".m4a", + ".mp4", +] + + +class AudioConverter(DocumentConverter): + """ + Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed). + """ + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + md_content = "" + + # Add metadata + metadata = exiftool_metadata( + file_stream, exiftool_path=kwargs.get("exiftool_path") + ) + if metadata: + for f in [ + "Title", + "Artist", + "Author", + "Band", + "Album", + "Genre", + "Track", + "DateTimeOriginal", + "CreateDate", + # "Duration", -- Wrong values when read from memory + "NumChannels", + "SampleRate", + "AvgBytesPerSec", + "BitsPerSample", + ]: + if f in metadata: + md_content += f"{f}: {metadata[f]}\n" + + # Figure out the audio format for transcription + if stream_info.extension == ".wav" or stream_info.mimetype == "audio/x-wav": + audio_format = "wav" + elif stream_info.extension == ".mp3" or stream_info.mimetype == "audio/mpeg": + audio_format = "mp3" + elif ( + stream_info.extension in [".mp4", ".m4a"] + or stream_info.mimetype == "video/mp4" + ): + audio_format = "mp4" + else: + audio_format = None + + # Transcribe + if audio_format: + try: + transcript = transcribe_audio(file_stream, audio_format=audio_format) + if transcript: + 
md_content += "\n\n### Audio Transcript:\n" + transcript + except MissingDependencyException: + pass + + # Return the result + return DocumentConverterResult(markdown=md_content.strip()) diff --git a/packages/markitdown/src/markitdown/converters/_base.py b/packages/markitdown/src/markitdown/converters/_base.py deleted file mode 100644 index 0f351fc..0000000 --- a/packages/markitdown/src/markitdown/converters/_base.py +++ /dev/null @@ -1,63 +0,0 @@ -from typing import Any, Union - - -class DocumentConverterResult: - """The result of converting a document to text.""" - - def __init__(self, title: Union[str, None] = None, text_content: str = ""): - self.title: Union[str, None] = title - self.text_content: str = text_content - - -class DocumentConverter: - """Abstract superclass of all DocumentConverters.""" - - # Lower priority values are tried first. - PRIORITY_SPECIFIC_FILE_FORMAT = ( - 0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia - ) - PRIORITY_GENERIC_FILE_FORMAT = ( - 10.0 # Near catch-all converters for mimetypes like text/*, etc. - ) - - def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT): - """ - Initialize the DocumentConverter with a given priority. - - Priorities work as follows: By default, most converters get priority - DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception - is the PlainTextConverter, which gets priority PRIORITY_SPECIFIC_FILE_FORMAT (== 10), - with lower values being tried first (i.e., higher priority). - - Just prior to conversion, the converters are sorted by priority, using - a stable sort. This means that converters with the same priority will - remain in the same order, with the most recently registered converters - appearing first. - - We have tight control over the order of built-in converters, but - plugins can register converters in any order. A converter's priority - field reasserts some control over the order of converters. 
- - Plugins can register converters with any priority, to appear before or - after the built-ins. For example, a plugin with priority 9 will run - before the PlainTextConverter, but after the built-in converters. - """ - self._priority = priority - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - raise NotImplementedError("Subclasses must implement this method") - - @property - def priority(self) -> float: - """Priority of the converter in markitdown's converter list. Higher priority values are tried first.""" - return self._priority - - @priority.setter - def priority(self, value: float): - self._priority = value - - @priority.deleter - def priority(self): - raise AttributeError("Cannot delete the priority attribute") diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py index d1b11a6..7dd9e24 100644 --- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py +++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py @@ -1,14 +1,24 @@ -# type: ignore -import base64 +import io import re - -from typing import Union +import base64 from urllib.parse import parse_qs, urlparse +from typing import Any, BinaryIO, Optional from bs4 import BeautifulSoup -from ._base import DocumentConverter, DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo from ._markdownify import _CustomMarkdownify +ACCEPTED_MIME_TYPE_PREFIXES = [ + "text/html", + "application/xhtml", +] + +ACCEPTED_FILE_EXTENSIONS = [ + ".html", + ".htm", +] + class BingSerpConverter(DocumentConverter): """ @@ -16,28 +26,47 @@ class BingSerpConverter(DocumentConverter): NOTE: It is better to use the Bing API """ - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) + def 
accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + """ + Make sure we're dealing with HTML content *from* Bing. + """ + + url = stream_info.url or "" + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a Bing SERP - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".html", ".htm"]: - return None - url = kwargs.get("url", "") if not re.search(r"^https://www\.bing\.com/search\?q=", url): - return None + # Not a Bing SERP URL + return False + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + # Not HTML content + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: # Parse the query parameters - parsed_params = parse_qs(urlparse(url).query) + parsed_params = parse_qs(urlparse(stream_info.url).query) query = parsed_params.get("q", [""])[0] - # Parse the file - soup = None - with open(local_path, "rt", encoding="utf-8") as fh: - soup = BeautifulSoup(fh.read(), "html.parser") + # Parse the stream + encoding = "utf-8" if stream_info.charset is None else stream_info.charset + soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) # Clean up some formatting for tptt in soup.find_all(class_="tptt"): @@ -81,6 +110,6 @@ class BingSerpConverter(DocumentConverter): ) return DocumentConverterResult( + markdown=webpage_text, title=None if soup.title is None else soup.title.string, - text_content=webpage_text, ) diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index 6fe79c0..2f116d0 
100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -1,9 +1,12 @@ -from typing import Any, Union -import re import sys +import re -from ._base import DocumentConverter, DocumentConverterResult -from .._exceptions import MissingDependencyException +from typing import BinaryIO, Any, List + +from ._html_converter import HtmlConverter +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo +from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE # Try loading optional (but in this case, required) dependencies # Save reporting of any exceptions for later @@ -26,17 +29,50 @@ except ImportError: CONTENT_FORMAT = "markdown" +OFFICE_MIME_TYPE_PREFIXES = [ + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/vnd.openxmlformats-officedocument.presentationml", + "application/xhtml", + "text/html", +] + +OTHER_MIME_TYPE_PREFIXES = [ + "application/pdf", + "application/x-pdf", + "text/html", + "image/", +] + +OFFICE_FILE_EXTENSIONS = [ + ".docx", + ".xlsx", + ".pptx", + ".html", + ".htm", +] + +OTHER_FILE_EXTENSIONS = [ + ".pdf", + ".jpeg", + ".jpg", + ".png", + ".bmp", + ".tiff", + ".heif", +] + + class DocumentIntelligenceConverter(DocumentConverter): """Specialized DocumentConverter that uses Document Intelligence to extract text from documents.""" def __init__( self, *, - priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT, endpoint: str, api_version: str = "2024-07-31-preview", ): - super().__init__(priority=priority) + super().__init__() # Raise an error if the dependencies are not available. 
# This is different than other converters since this one isn't even instantiated @@ -44,9 +80,11 @@ class DocumentIntelligenceConverter(DocumentConverter): if _dependency_exc_info is not None: raise MissingDependencyException( "DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`" - ) from _dependency_exc_info[1].with_traceback( + ) from _dependency_exc_info[ + 1 + ].with_traceback( # type: ignore[union-attr] _dependency_exc_info[2] - ) # Restore the original traceback + ) self.endpoint = endpoint self.api_version = api_version @@ -55,55 +93,62 @@ class DocumentIntelligenceConverter(DocumentConverter): api_version=self.api_version, credential=DefaultAzureCredential(), ) - self._priority = priority + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension in OFFICE_FILE_EXTENSIONS + OTHER_FILE_EXTENSIONS: + return True + + for prefix in OFFICE_MIME_TYPE_PREFIXES + OTHER_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def _analysis_features(self, stream_info: StreamInfo) -> List[str]: + """ + Helper needed to determine which analysis features to use. 
+ Certain document analysis features are not availiable for + office filetypes (.xlsx, .pptx, .html, .docx) + """ + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension in OFFICE_FILE_EXTENSIONS: + return [] + + for prefix in OFFICE_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return [] + + return [ + DocumentAnalysisFeature.FORMULAS, # enable formula extraction + DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR + DocumentAnalysisFeature.STYLE_FONT, # enable font style extraction + ] def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if extension is not supported by Document Intelligence - extension = kwargs.get("file_extension", "") - docintel_extensions = [ - ".pdf", - ".docx", - ".xlsx", - ".pptx", - ".html", - ".jpeg", - ".jpg", - ".png", - ".bmp", - ".tiff", - ".heif", - ] - if extension.lower() not in docintel_extensions: - return None - - # Get the bytestring for the local path - with open(local_path, "rb") as f: - file_bytes = f.read() - - # Certain document analysis features are not availiable for office filetypes (.xlsx, .pptx, .html, .docx) - if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]: - analysis_features = [] - else: - analysis_features = [ - DocumentAnalysisFeature.FORMULAS, # enable formula extraction - DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR - DocumentAnalysisFeature.STYLE_FONT, # enable font style extraction - ] - + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: # Extract the text using Azure Document Intelligence poller = self.doc_intel_client.begin_analyze_document( model_id="prebuilt-layout", - body=AnalyzeDocumentRequest(bytes_source=file_bytes), - features=analysis_features, + body=AnalyzeDocumentRequest(bytes_source=file_stream.read()), + 
features=self._analysis_features(stream_info), output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed ) result: AnalyzeResult = poller.result() # remove comments from the markdown content generated by Doc Intelligence and append to markdown string markdown_text = re.sub(r"", "", result.content, flags=re.DOTALL) - return DocumentConverterResult( - title=None, - text_content=markdown_text, - ) + return DocumentConverterResult(markdown=markdown_text) diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index 0866e59..c568acb 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -1,13 +1,10 @@ import sys -from typing import Union +from typing import BinaryIO, Any -from ._base import ( - DocumentConverterResult, -) - -from ._base import DocumentConverter from ._html_converter import HtmlConverter +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE # Try loading optional (but in this case, required) dependencies @@ -20,22 +17,46 @@ except ImportError: _dependency_exc_info = sys.exc_info() +ACCEPTED_MIME_TYPE_PREFIXES = [ + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", +] + +ACCEPTED_FILE_EXTENSIONS = [".docx"] + + class DocxConverter(HtmlConverter): """ Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. 
""" - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) + def __init__(self): + super().__init__() + self._html_converter = HtmlConverter() - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a DOCX - extension = kwargs.get("file_extension", "") - if extension.lower() != ".docx": - return None + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: # Check: the dependencies if _dependency_exc_info is not None: raise MissingDependencyException( @@ -44,16 +65,13 @@ class DocxConverter(HtmlConverter): extension=".docx", feature="docx", ) - ) from _dependency_exc_info[1].with_traceback( + ) from _dependency_exc_info[ + 1 + ].with_traceback( # type: ignore[union-attr] _dependency_exc_info[2] - ) # Restore the original traceback + ) - result = None - with open(local_path, "rb") as docx_file: - style_map = kwargs.get("style_map", None) - - result = mammoth.convert_to_html(docx_file, style_map=style_map) - html_content = result.value - result = self._convert(html_content) - - return result + style_map = kwargs.get("style_map", None) + return self._html_converter.convert_string( + mammoth.convert_to_html(file_stream, style_map=style_map).value + ) diff --git a/packages/markitdown/src/markitdown/converters/_exiftool.py b/packages/markitdown/src/markitdown/converters/_exiftool.py new file mode 100644 index 0000000..5a316f0 --- /dev/null +++ 
b/packages/markitdown/src/markitdown/converters/_exiftool.py @@ -0,0 +1,44 @@ +import json +import subprocess +import locale +import sys +import shutil +import os +import warnings +from typing import BinaryIO, Optional, Any + + +def exiftool_metadata( + file_stream: BinaryIO, *, exiftool_path: Optional[str] = None +) -> Any: # Need a better type for json data + # Check if we have a valid pointer to exiftool + if not exiftool_path: + which_exiftool = shutil.which("exiftool") + if which_exiftool: + warnings.warn( + f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g., + + md = MarkItDown(exiftool_path="{which_exiftool}") + +This warning will be removed in future releases. +""", + DeprecationWarning, + ) + # Nothing to do + return {} + + # Run exiftool + cur_pos = file_stream.tell() + try: + output = subprocess.run( + [exiftool_path, "-json", "-"], + input=file_stream.read(), + capture_output=True, + text=False, + ).stdout + + return json.loads( + output.decode(locale.getpreferredencoding(False)), + )[0] + finally: + file_stream.seek(cur_pos) diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py index 68c2536..8a8203d 100644 --- a/packages/markitdown/src/markitdown/converters/_html_converter.py +++ b/packages/markitdown/src/markitdown/converters/_html_converter.py @@ -1,37 +1,52 @@ -from typing import Any, Union +import io +from typing import Any, BinaryIO, Optional from bs4 import BeautifulSoup -from ._base import DocumentConverter, DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo from ._markdownify import _CustomMarkdownify +ACCEPTED_MIME_TYPE_PREFIXES = [ + "text/html", + "application/xhtml", +] + +ACCEPTED_FILE_EXTENSIONS = [ + ".html", + ".htm", +] + class 
HtmlConverter(DocumentConverter): """Anything with content type text/html""" - def __init__( - self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT - ): - super().__init__(priority=priority) + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not html - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".html", ".htm"]: - return None - - result = None - with open(local_path, "rt", encoding="utf-8") as fh: - result = self._convert(fh.read()) - - return result - - def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]: - """Helper function that converts an HTML string.""" - - # Parse the string - soup = BeautifulSoup(html_content, "html.parser") + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + # Parse the stream + encoding = "utf-8" if stream_info.charset is None else stream_info.charset + soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) # Remove javascript and style blocks for script in soup(["script", "style"]): @@ -51,6 +66,25 @@ class HtmlConverter(DocumentConverter): webpage_text = webpage_text.strip() return DocumentConverterResult( + markdown=webpage_text, title=None if soup.title is None else soup.title.string, - text_content=webpage_text, + ) + + def convert_string( + self, html_content: str, *, url: Optional[str] = None, **kwargs + ) -> DocumentConverterResult: + """ + Non-standard convenience method to convert 
a string to markdown. + Given that many converters produce HTML as intermediate output, this + allows for easy conversion of HTML to markdown. + """ + return self.convert( + file_stream=io.BytesIO(html_content.encode("utf-8")), + stream_info=StreamInfo( + mimetype="text/html", + extension=".html", + charset="utf-8", + url=url, + ), + **kwargs, ) diff --git a/packages/markitdown/src/markitdown/converters/_image_converter.py b/packages/markitdown/src/markitdown/converters/_image_converter.py index 4eb6155..dd8fbac 100644 --- a/packages/markitdown/src/markitdown/converters/_image_converter.py +++ b/packages/markitdown/src/markitdown/converters/_image_converter.py @@ -1,30 +1,53 @@ -from typing import Union -from ._base import DocumentConverter, DocumentConverterResult -from ._media_converter import MediaConverter +from typing import BinaryIO, Any, Union import base64 import mimetypes +from ._exiftool import exiftool_metadata +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo + +ACCEPTED_MIME_TYPE_PREFIXES = [ + "image/jpeg", + "image/png", +] + +ACCEPTED_FILE_EXTENSIONS = [".jpg", ".jpeg", ".png"] -class ImageConverter(MediaConverter): +class ImageConverter(DocumentConverter): """ Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured). 
""" - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not an image - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".jpg", ".jpeg", ".png"]: - return None + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: md_content = "" # Add metadata - metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) + metadata = exiftool_metadata( + file_stream, exiftool_path=kwargs.get("exiftool_path") + ) if metadata: for f in [ @@ -42,39 +65,59 @@ class ImageConverter(MediaConverter): if f in metadata: md_content += f"{f}: {metadata[f]}\n" - # Try describing the image with GPTV + # Try describing the image with GPT llm_client = kwargs.get("llm_client") llm_model = kwargs.get("llm_model") if llm_client is not None and llm_model is not None: - md_content += ( - "\n# Description:\n" - + self._get_llm_description( - local_path, - extension, - llm_client, - llm_model, - prompt=kwargs.get("llm_prompt"), - ).strip() - + "\n" + llm_description = self._get_llm_description( + file_stream, + stream_info, + client=llm_client, + model=llm_model, + prompt=kwargs.get("llm_prompt"), ) + if llm_description is not None: + md_content += "\n# Description:\n" + llm_description.strip() + "\n" + return DocumentConverterResult( - title=None, - text_content=md_content, + markdown=md_content, ) - def 
_get_llm_description(self, local_path, extension, client, model, prompt=None): + def _get_llm_description( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + *, + client, + model, + prompt=None, + ) -> Union[None, str]: if prompt is None or prompt.strip() == "": prompt = "Write a detailed caption for this image." - data_uri = "" - with open(local_path, "rb") as image_file: - content_type, encoding = mimetypes.guess_type("_dummy" + extension) - if content_type is None: - content_type = "image/jpeg" - image_base64 = base64.b64encode(image_file.read()).decode("utf-8") - data_uri = f"data:{content_type};base64,{image_base64}" + # Get the content type + content_type = stream_info.mimetype + if not content_type: + content_type, _ = mimetypes.guess_type( + "_dummy" + (stream_info.extension or "") + ) + if not content_type: + content_type = "application/octet-stream" + # Convert to base64 + cur_pos = file_stream.tell() + try: + base64_image = base64.b64encode(file_stream.read()).decode("utf-8") + except Exception as e: + return None + finally: + file_stream.seek(cur_pos) + + # Prepare the data-uri + data_uri = f"data:{content_type};base64,{base64_image}" + + # Prepare the OpenAI API request messages = [ { "role": "user", @@ -90,5 +133,6 @@ class ImageConverter(MediaConverter): } ] + # Call the OpenAI API response = client.chat.completions.create(model=model, messages=messages) return response.choices[0].message.content diff --git a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py index b487f41..f8ba193 100644 --- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py +++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py @@ -1,39 +1,62 @@ +from typing import BinaryIO, Any import json -from typing import Any, Union - -from ._base import ( - DocumentConverter, - DocumentConverterResult, -) +from .._base_converter import DocumentConverter, 
DocumentConverterResult from .._exceptions import FileConversionException +from .._stream_info import StreamInfo + +CANDIDATE_MIME_TYPE_PREFIXES = [ + "application/json", +] + +ACCEPTED_FILE_EXTENSIONS = [".ipynb"] class IpynbConverter(DocumentConverter): """Converts Jupyter Notebook (.ipynb) files to Markdown.""" - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in CANDIDATE_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + # Read further to see if it's a notebook + cur_pos = file_stream.tell() + try: + encoding = stream_info.charset or "utf-8" + notebook_content = file_stream.read().decode(encoding) + return ( + "nbformat" in notebook_content + and "nbformat_minor" in notebook_content + ) + finally: + file_stream.seek(cur_pos) + + return False def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not ipynb - extension = kwargs.get("file_extension", "") - if extension.lower() != ".ipynb": - return None - + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: # Parse and convert the notebook result = None - with open(local_path, "rt", encoding="utf-8") as fh: - notebook_content = json.load(fh) - result = self._convert(notebook_content) - return result + encoding = stream_info.charset or "utf-8" + notebook_content = file_stream.read().decode(encoding=encoding) + return self._convert(json.loads(notebook_content)) - def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]: + def _convert(self, 
notebook_content: dict) -> DocumentConverterResult: """Helper function that converts notebook JSON content to Markdown.""" try: md_output = [] @@ -65,8 +88,8 @@ class IpynbConverter(DocumentConverter): title = notebook_content.get("metadata", {}).get("title", title) return DocumentConverterResult( + markdown=md_text, title=title, - text_content=md_text, ) except Exception as e: diff --git a/packages/markitdown/src/markitdown/converters/_llm_caption.py b/packages/markitdown/src/markitdown/converters/_llm_caption.py new file mode 100644 index 0000000..b851dc8 --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_llm_caption.py @@ -0,0 +1,50 @@ +from typing import BinaryIO, Any, Union +import base64 +import mimetypes +from .._stream_info import StreamInfo + + +def llm_caption( + file_stream: BinaryIO, stream_info: StreamInfo, *, client, model, prompt=None +) -> Union[None, str]: + if prompt is None or prompt.strip() == "": + prompt = "Write a detailed caption for this image." + + # Get the content type + content_type = stream_info.mimetype + if not content_type: + content_type, _ = mimetypes.guess_type("_dummy" + (stream_info.extension or "")) + if not content_type: + content_type = "application/octet-stream" + + # Convert to base64 + cur_pos = file_stream.tell() + try: + base64_image = base64.b64encode(file_stream.read()).decode("utf-8") + except Exception as e: + return None + finally: + file_stream.seek(cur_pos) + + # Prepare the data-uri + data_uri = f"data:{content_type};base64,{base64_image}" + + # Prepare the OpenAI API request + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": { + "url": data_uri, + }, + }, + ], + } + ] + + # Call the OpenAI API + response = client.chat.completions.create(model=model, messages=messages) + return response.choices[0].message.content diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py 
b/packages/markitdown/src/markitdown/converters/_markdownify.py index e15f607..ae99c0b 100644 --- a/packages/markitdown/src/markitdown/converters/_markdownify.py +++ b/packages/markitdown/src/markitdown/converters/_markdownify.py @@ -1,7 +1,7 @@ import re import markdownify -from typing import Any +from typing import Any, Optional from urllib.parse import quote, unquote, urlparse, urlunparse @@ -20,7 +20,14 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): # Explicitly cast options to the expected type if necessary super().__init__(**options) - def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str: + def convert_hn( + self, + n: int, + el: Any, + text: str, + convert_as_inline: Optional[bool] = False, + **kwargs, + ) -> str: """Same as usual, but be sure to start with a new line""" if not convert_as_inline: if not re.search(r"^\n", text): @@ -28,7 +35,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): return super().convert_hn(n, el, text, convert_as_inline) # type: ignore - def convert_a(self, el: Any, text: str, convert_as_inline: bool): + def convert_a( + self, + el: Any, + text: str, + convert_as_inline: Optional[bool] = False, + **kwargs, + ): """Same as usual converter, but removes Javascript links and escapes URIs.""" prefix, suffix, text = markdownify.chomp(text) # type: ignore if not text: @@ -68,7 +81,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): else text ) - def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str: + def convert_img( + self, + el: Any, + text: str, + convert_as_inline: Optional[bool] = False, + **kwargs, + ) -> str: """Same as usual converter, but removes data URIs""" alt = el.attrs.get("alt", None) or "" diff --git a/packages/markitdown/src/markitdown/converters/_media_converter.py b/packages/markitdown/src/markitdown/converters/_media_converter.py deleted file mode 100644 index 5c7d82b..0000000 --- 
a/packages/markitdown/src/markitdown/converters/_media_converter.py +++ /dev/null @@ -1,41 +0,0 @@ -import subprocess -import shutil -import json -from warnings import warn - -from ._base import DocumentConverter - - -class MediaConverter(DocumentConverter): - """ - Abstract class for multi-modal media (e.g., images and audio) - """ - - def __init__( - self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT - ): - super().__init__(priority=priority) - - def _get_metadata(self, local_path, exiftool_path=None): - if not exiftool_path: - which_exiftool = shutil.which("exiftool") - if which_exiftool: - warn( - f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g., - - md = MarkItDown(exiftool_path="{which_exiftool}") - -This warning will be removed in future releases. -""", - DeprecationWarning, - ) - - return None - else: - if True: - result = subprocess.run( - [exiftool_path, "-json", local_path], capture_output=True, text=True - ).stdout - return json.loads(result)[0] - # except Exception: - # return None diff --git a/packages/markitdown/src/markitdown/converters/_mp3_converter.py b/packages/markitdown/src/markitdown/converters/_mp3_converter.py deleted file mode 100644 index 91fd270..0000000 --- a/packages/markitdown/src/markitdown/converters/_mp3_converter.py +++ /dev/null @@ -1,89 +0,0 @@ -import tempfile -from typing import Union -from ._base import DocumentConverter, DocumentConverterResult -from ._wav_converter import WavConverter -from warnings import resetwarnings, catch_warnings - -# Optional Transcription support -IS_AUDIO_TRANSCRIPTION_CAPABLE = False -try: - # Using warnings' catch_warnings to catch - # pydub's warning of ffmpeg or avconv missing - with catch_warnings(record=True) as w: - import pydub - - if w: - raise ModuleNotFoundError - import speech_recognition as sr - - IS_AUDIO_TRANSCRIPTION_CAPABLE = 
True -except ModuleNotFoundError: - pass -finally: - resetwarnings() - - -class Mp3Converter(WavConverter): - """ - Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed). - """ - - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a MP3 - extension = kwargs.get("file_extension", "") - if extension.lower() != ".mp3": - return None - - md_content = "" - - # Add metadata - metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) - if metadata: - for f in [ - "Title", - "Artist", - "Author", - "Band", - "Album", - "Genre", - "Track", - "DateTimeOriginal", - "CreateDate", - "Duration", - ]: - if f in metadata: - md_content += f"{f}: {metadata[f]}\n" - - # Transcribe - if IS_AUDIO_TRANSCRIPTION_CAPABLE: - handle, temp_path = tempfile.mkstemp(suffix=".wav") - os.close(handle) - try: - sound = pydub.AudioSegment.from_mp3(local_path) - sound.export(temp_path, format="wav") - - _args = dict() - _args.update(kwargs) - _args["file_extension"] = ".wav" - - try: - transcript = super()._transcribe_audio(temp_path).strip() - md_content += "\n\n### Audio Transcript:\n" + ( - "[No speech detected]" if transcript == "" else transcript - ) - except Exception: - md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio." 
- - finally: - os.unlink(temp_path) - - # Return the result - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) diff --git a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py index eb7a065..8a61b0c 100644 --- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py +++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py @@ -1,6 +1,7 @@ import sys -from typing import Any, Union -from ._base import DocumentConverter, DocumentConverterResult +from typing import Any, Union, BinaryIO +from .._stream_info import StreamInfo +from .._base_converter import DocumentConverter, DocumentConverterResult from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE # Try loading optional (but in this case, required) dependencies @@ -12,6 +13,12 @@ except ImportError: # Preserve the error and stack trace for later _dependency_exc_info = sys.exc_info() +ACCEPTED_MIME_TYPE_PREFIXES = [ + "application/vnd.ms-outlook", +] + +ACCEPTED_FILE_EXTENSIONS = [".msg"] + class OutlookMsgConverter(DocumentConverter): """Converts Outlook .msg files to markdown by extracting email metadata and content. 
@@ -21,19 +28,52 @@ class OutlookMsgConverter(DocumentConverter): - Email body content """ - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + # Check the extension and mimetype + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + # Brute force, check if we have an OLE file + cur_pos = file_stream.tell() + try: + if not olefile.isOleFile(file_stream): + return False + finally: + file_stream.seek(cur_pos) + + # Brute force, check if it's an Outlook file + try: + msg = olefile.OleFileIO(file_stream) + toc = "\n".join([str(stream) for stream in msg.listdir()]) + return ( + "__properties_version1.0" in toc + and "__recip_version1.0_#00000000" in toc + ) + except Exception as e: + pass + finally: + file_stream.seek(cur_pos) + + return False def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not a MSG file - extension = kwargs.get("file_extension", "") - if extension.lower() != ".msg": - return None - + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: # Check: the dependencies if _dependency_exc_info is not None: raise MissingDependencyException( @@ -42,44 +82,41 @@ class OutlookMsgConverter(DocumentConverter): extension=".msg", feature="outlook", ) - ) from _dependency_exc_info[1].with_traceback( + ) from _dependency_exc_info[ + 1 + ].with_traceback( # type: ignore[union-attr] _dependency_exc_info[2] - ) # Restore the original traceback - - try: - msg = olefile.OleFileIO(local_path) - # Extract email 
metadata - md_content = "# Email Message\n\n" - - # Get headers - headers = { - "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"), - "To": self._get_stream_data(msg, "__substg1.0_0E04001F"), - "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"), - } - - # Add headers to markdown - for key, value in headers.items(): - if value: - md_content += f"**{key}:** {value}\n" - - md_content += "\n## Content\n\n" - - # Get email body - body = self._get_stream_data(msg, "__substg1.0_1000001F") - if body: - md_content += body - - msg.close() - - return DocumentConverterResult( - title=headers.get("Subject"), text_content=md_content.strip() ) - except Exception as e: - raise FileConversionException( - f"Could not convert MSG file '{local_path}': {str(e)}" - ) + msg = olefile.OleFileIO(file_stream) + # Extract email metadata + md_content = "# Email Message\n\n" + + # Get headers + headers = { + "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"), + "To": self._get_stream_data(msg, "__substg1.0_0E04001F"), + "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"), + } + + # Add headers to markdown + for key, value in headers.items(): + if value: + md_content += f"**{key}:** {value}\n" + + md_content += "\n## Content\n\n" + + # Get email body + body = self._get_stream_data(msg, "__substg1.0_1000001F") + if body: + md_content += body + + msg.close() + + return DocumentConverterResult( + markdown=md_content.strip(), + title=headers.get("Subject"), + ) def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]: """Helper to safely extract and decode stream data from the MSG file.""" diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index 3c5ecad..4586ef1 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -1,8 +1,15 @@ import sys -from typing import 
Union -from ._base import DocumentConverter, DocumentConverterResult +import io + +from typing import BinaryIO, Any + + +from ._html_converter import HtmlConverter +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE + # Try loading optional (but in this case, required) dependencies # Save reporting of any exceptions for later _dependency_exc_info = None @@ -14,22 +21,43 @@ except ImportError: _dependency_exc_info = sys.exc_info() +ACCEPTED_MIME_TYPE_PREFIXES = [ + "application/pdf", + "application/x-pdf", +] + +ACCEPTED_FILE_EXTENSIONS = [".pdf"] + + class PdfConverter(DocumentConverter): """ Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text. """ - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a PDF - extension = kwargs.get("file_extension", "") - if extension.lower() != ".pdf": - return None + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: # Check the dependencies if _dependency_exc_info is not None: raise MissingDependencyException( @@ -38,11 +66,13 @@ class PdfConverter(DocumentConverter): extension=".pdf", feature="pdf", ) - ) from _dependency_exc_info[1].with_traceback( + ) 
from _dependency_exc_info[ + 1 + ].with_traceback( # type: ignore[union-attr] _dependency_exc_info[2] - ) # Restore the original traceback + ) + assert isinstance(file_stream, io.IOBase) # for mypy return DocumentConverterResult( - title=None, - text_content=pdfminer.high_level.extract_text(local_path), + markdown=pdfminer.high_level.extract_text(file_stream), ) diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py index b4c9282..4a21d3a 100644 --- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py +++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py @@ -1,13 +1,26 @@ -import mimetypes +import sys -from charset_normalizer import from_path -from typing import Any, Union +from typing import BinaryIO, Any +from charset_normalizer import from_bytes +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo -from ._base import DocumentConverter, DocumentConverterResult +# Try loading optional (but in this case, required) dependencies +# Save reporting of any exceptions for later +_dependency_exc_info = None +try: + import mammoth +except ImportError: + # Preserve the error and stack trace for later + _dependency_exc_info = sys.exc_info() +ACCEPTED_MIME_TYPE_PREFIXES = [ + "text/", + "application/json", +] # Mimetypes to ignore (commonly confused extensions) -IGNORE_MIMETYPES = [ +IGNORE_MIME_TYPE_PREFIXES = [ "text/vnd.in3d.spot", # .spo wich is confused with xls, doc, etc. "text/vnd.graphviz", # .dot which is confused with xls, doc, etc. 
] @@ -16,34 +29,34 @@ IGNORE_MIMETYPES = [ class PlainTextConverter(DocumentConverter): """Anything with content type text/plain""" - def __init__( - self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT - ): - super().__init__(priority=priority) + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + for prefix in IGNORE_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return False + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Guess the content type from any file extension that might be around - content_type, _ = mimetypes.guess_type( - "__placeholder" + kwargs.get("file_extension", "") - ) + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + if stream_info.charset: + text_content = file_stream.read().decode(stream_info.charset) + else: + text_content = str(from_bytes(file_stream.read()).best()) - # Ignore common false positives - if content_type in IGNORE_MIMETYPES: - content_type = None - - # Only accept text files - if content_type is None: - return None - elif all( - not content_type.lower().startswith(type_prefix) - for type_prefix in ["text/", "application/json"] - ): - return None - - text_content = str(from_path(local_path).best()) - return DocumentConverterResult( - title=None, - text_content=text_content, - ) + return DocumentConverterResult(markdown=text_content) diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index 431b6a0..bea1226 100644 --- 
a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -1,12 +1,16 @@ +import sys import base64 +import os +import io import re import html -import sys -from typing import Union +from typing import BinaryIO, Any -from ._base import DocumentConverterResult, DocumentConverter from ._html_converter import HtmlConverter +from ._llm_caption import llm_caption +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE # Try loading optional (but in this case, required) dependencies @@ -19,51 +23,46 @@ except ImportError: _dependency_exc_info = sys.exc_info() -class PptxConverter(HtmlConverter): +ACCEPTED_MIME_TYPE_PREFIXES = [ + "application/vnd.openxmlformats-officedocument.presentationml", +] + +ACCEPTED_FILE_EXTENSIONS = [".pptx"] + + +class PptxConverter(DocumentConverter): """ Converts PPTX files to Markdown. Supports heading, tables and images with alt text. """ - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) + def __init__(self): + super().__init__() + self._html_converter = HtmlConverter() - def _get_llm_description( - self, llm_client, llm_model, image_blob, content_type, prompt=None - ): - if prompt is None or prompt.strip() == "": - prompt = "Write a detailed alt text for this image with less than 50 words." 
+ def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() - image_base64 = base64.b64encode(image_blob).decode("utf-8") - data_uri = f"data:{content_type};base64,{image_base64}" + if extension in ACCEPTED_FILE_EXTENSIONS: + return True - messages = [ - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": data_uri, - }, - }, - {"type": "text", "text": prompt}, - ], - } - ] + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True - response = llm_client.chat.completions.create( - model=llm_model, messages=messages - ) - return response.choices[0].message.content - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a PPTX - extension = kwargs.get("file_extension", "") - if extension.lower() != ".pptx": - return None + return False + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: # Check the dependencies if _dependency_exc_info is not None: raise MissingDependencyException( @@ -72,11 +71,14 @@ class PptxConverter(HtmlConverter): extension=".pptx", feature="pptx", ) - ) from _dependency_exc_info[1].with_traceback( + ) from _dependency_exc_info[ + 1 + ].with_traceback( # type: ignore[union-attr] _dependency_exc_info[2] - ) # Restore the original traceback + ) - presentation = pptx.Presentation(local_path) + # Perform the conversion + presentation = pptx.Presentation(file_stream) md_content = "" slide_num = 0 for slide in presentation.slides: @@ -92,59 +94,58 @@ class PptxConverter(HtmlConverter): if self._is_picture(shape): # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069 - llm_description = None - alt_text = None + llm_description = "" + 
alt_text = "" + # Potentially generate a description using an LLM llm_client = kwargs.get("llm_client") llm_model = kwargs.get("llm_model") if llm_client is not None and llm_model is not None: + # Prepare a file_stream and stream_info for the image data + image_filename = shape.image.filename + image_extension = None + if image_filename: + image_extension = os.path.splitext(image_filename)[1] + image_stream_info = StreamInfo( + mimetype=shape.image.content_type, + extension=image_extension, + filename=image_filename, + ) + + image_stream = io.BytesIO(shape.image.blob) + + # Caption the image try: - llm_description = self._get_llm_description( - llm_client, - llm_model, - shape.image.blob, - shape.image.content_type, + llm_description = llm_caption( + image_stream, + image_stream_info, + client=llm_client, + model=llm_model, + prompt=kwargs.get("llm_prompt"), ) except Exception: - # Unable to describe with LLM + # Unable to generate a description pass - if not llm_description: - try: - alt_text = shape._element._nvXxPr.cNvPr.attrib.get( - "descr", "" - ) - except Exception: - # Unable to get alt text - pass + # Also grab any description embedded in the deck + try: + alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") + except Exception: + # Unable to get alt text + pass + + # Prepare the alt, escaping any special characters + alt_text = "\n".join([llm_description, alt_text]) or shape.name + alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text) + alt_text = re.sub(r"\s+", " ", alt_text).strip() # A placeholder name filename = re.sub(r"\W", "", shape.name) + ".jpg" - md_content += ( - "\n![" - + (llm_description or alt_text or shape.name) - + "](" - + filename - + ")\n" - ) + md_content += "\n![" + alt_text + "](" + filename + ")\n" # Tables if self._is_table(shape): - html_table = "" - first_row = True - for row in shape.table.rows: - html_table += "" - for cell in row.cells: - if first_row: - html_table += "" - else: - html_table += "" - html_table += "" - 
first_row = False - html_table += "
" + html.escape(cell.text) + "" + html.escape(cell.text) + "
" - md_content += ( - "\n" + self._convert(html_table).text_content.strip() + "\n" - ) + md_content += self._convert_table_to_markdown(shape.table) # Charts if shape.has_chart: @@ -174,10 +175,7 @@ class PptxConverter(HtmlConverter): md_content += notes_frame.text md_content = md_content.strip() - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) + return DocumentConverterResult(markdown=md_content.strip()) def _is_picture(self, shape): if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE: @@ -192,6 +190,23 @@ class PptxConverter(HtmlConverter): return True return False + def _convert_table_to_markdown(self, table): + # Write the table as HTML, then convert it to Markdown + html_table = "" + first_row = True + for row in table.rows: + html_table += "" + for cell in row.cells: + if first_row: + html_table += "" + else: + html_table += "" + html_table += "" + first_row = False + html_table += "
" + html.escape(cell.text) + "" + html.escape(cell.text) + "
" + + return self._html_converter.convert_string(html_table).markdown.strip() + "\n" + def _convert_chart_to_markdown(self, chart): md = "\n\n### Chart" if chart.has_title: diff --git a/packages/markitdown/src/markitdown/converters/_rss_converter.py b/packages/markitdown/src/markitdown/converters/_rss_converter.py index b279c85..dbafc1b 100644 --- a/packages/markitdown/src/markitdown/converters/_rss_converter.py +++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py @@ -1,128 +1,165 @@ from xml.dom import minidom -from typing import Union +from typing import BinaryIO, Any, Union from bs4 import BeautifulSoup from ._markdownify import _CustomMarkdownify -from ._base import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo +from .._base_converter import DocumentConverter, DocumentConverterResult + +PRECISE_MIME_TYPE_PREFIXES = [ + "application/rss", + "application/atom", +] + +PRECISE_FILE_EXTENSIONS = [".rss", ".atom"] + +CANDIDATE_MIME_TYPE_PREFIXES = [ + "text/xml", + "application/xml", +] + +CANDIDATE_FILE_EXTENSIONS = [ + ".xml", +] class RssConverter(DocumentConverter): """Convert RSS / Atom type to markdown""" - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() - def convert( - self, local_path: str, **kwargs - ) -> Union[None, DocumentConverterResult]: - # Bail if not RSS type - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".xml", ".rss", ".atom"]: - return None + # Check for precise mimetypes and file extensions + if extension in PRECISE_FILE_EXTENSIONS: + return True + + for prefix in PRECISE_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + 
# Check for precise mimetypes and file extensions + if extension in CANDIDATE_FILE_EXTENSIONS: + return self._check_xml(file_stream) + + for prefix in CANDIDATE_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return self._check_xml(file_stream) + + return False + + def _check_xml(self, file_stream: BinaryIO) -> bool: + cur_pos = file_stream.tell() try: - doc = minidom.parse(local_path) + doc = minidom.parse(file_stream) + return self._feed_type(doc) is not None except BaseException as _: - return None - result = None + pass + finally: + file_stream.seek(cur_pos) + return False + + def _feed_type(self, doc: Any) -> str: if doc.getElementsByTagName("rss"): - # A RSS feed must have a root element of - result = self._parse_rss_type(doc) + return "rss" elif doc.getElementsByTagName("feed"): root = doc.getElementsByTagName("feed")[0] if root.getElementsByTagName("entry"): # An Atom feed must have a root element of and at least one - result = self._parse_atom_type(doc) - else: - return None + return "atom" + return None + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + doc = minidom.parse(file_stream) + feed_type = self._feed_type(doc) + + if feed_type == "rss": + return self._parse_rss_type(doc) + elif feed_type == "atom": + return self._parse_atom_type(doc) else: - # not rss or atom - return None + raise ValueError("Unknown feed type") - return result - - def _parse_atom_type( - self, doc: minidom.Document - ) -> Union[None, DocumentConverterResult]: + def _parse_atom_type(self, doc: minidom.Document) -> DocumentConverterResult: """Parse the type of an Atom feed. Returns None if the feed type is not recognized or something goes wrong. 
""" - try: - root = doc.getElementsByTagName("feed")[0] - title = self._get_data_by_tag_name(root, "title") - subtitle = self._get_data_by_tag_name(root, "subtitle") - entries = root.getElementsByTagName("entry") - md_text = f"# {title}\n" - if subtitle: - md_text += f"{subtitle}\n" - for entry in entries: - entry_title = self._get_data_by_tag_name(entry, "title") - entry_summary = self._get_data_by_tag_name(entry, "summary") - entry_updated = self._get_data_by_tag_name(entry, "updated") - entry_content = self._get_data_by_tag_name(entry, "content") + root = doc.getElementsByTagName("feed")[0] + title = self._get_data_by_tag_name(root, "title") + subtitle = self._get_data_by_tag_name(root, "subtitle") + entries = root.getElementsByTagName("entry") + md_text = f"# {title}\n" + if subtitle: + md_text += f"{subtitle}\n" + for entry in entries: + entry_title = self._get_data_by_tag_name(entry, "title") + entry_summary = self._get_data_by_tag_name(entry, "summary") + entry_updated = self._get_data_by_tag_name(entry, "updated") + entry_content = self._get_data_by_tag_name(entry, "content") - if entry_title: - md_text += f"\n## {entry_title}\n" - if entry_updated: - md_text += f"Updated on: {entry_updated}\n" - if entry_summary: - md_text += self._parse_content(entry_summary) - if entry_content: - md_text += self._parse_content(entry_content) + if entry_title: + md_text += f"\n## {entry_title}\n" + if entry_updated: + md_text += f"Updated on: {entry_updated}\n" + if entry_summary: + md_text += self._parse_content(entry_summary) + if entry_content: + md_text += self._parse_content(entry_content) - return DocumentConverterResult( - title=title, - text_content=md_text, - ) - except BaseException as _: - return None + return DocumentConverterResult( + markdown=md_text, + title=title, + ) - def _parse_rss_type( - self, doc: minidom.Document - ) -> Union[None, DocumentConverterResult]: + def _parse_rss_type(self, doc: minidom.Document) -> DocumentConverterResult: """Parse the 
type of an RSS feed. Returns None if the feed type is not recognized or something goes wrong. """ - try: - root = doc.getElementsByTagName("rss")[0] - channel = root.getElementsByTagName("channel") - if not channel: - return None - channel = channel[0] - channel_title = self._get_data_by_tag_name(channel, "title") - channel_description = self._get_data_by_tag_name(channel, "description") - items = channel.getElementsByTagName("item") - if channel_title: - md_text = f"# {channel_title}\n" - if channel_description: - md_text += f"{channel_description}\n" - if not items: - items = [] - for item in items: - title = self._get_data_by_tag_name(item, "title") - description = self._get_data_by_tag_name(item, "description") - pubDate = self._get_data_by_tag_name(item, "pubDate") - content = self._get_data_by_tag_name(item, "content:encoded") - - if title: - md_text += f"\n## {title}\n" - if pubDate: - md_text += f"Published on: {pubDate}\n" - if description: - md_text += self._parse_content(description) - if content: - md_text += self._parse_content(content) - - return DocumentConverterResult( - title=channel_title, - text_content=md_text, - ) - except BaseException as _: - print(traceback.format_exc()) + root = doc.getElementsByTagName("rss")[0] + channel = root.getElementsByTagName("channel") + if not channel: return None + channel = channel[0] + channel_title = self._get_data_by_tag_name(channel, "title") + channel_description = self._get_data_by_tag_name(channel, "description") + items = channel.getElementsByTagName("item") + if channel_title: + md_text = f"# {channel_title}\n" + if channel_description: + md_text += f"{channel_description}\n" + if not items: + items = [] + for item in items: + title = self._get_data_by_tag_name(item, "title") + description = self._get_data_by_tag_name(item, "description") + pubDate = self._get_data_by_tag_name(item, "pubDate") + content = self._get_data_by_tag_name(item, "content:encoded") + + if title: + md_text += f"\n## {title}\n" + 
if pubDate: + md_text += f"Published on: {pubDate}\n" + if description: + md_text += self._parse_content(description) + if content: + md_text += self._parse_content(content) + + return DocumentConverterResult( + markdown=md_text, + title=channel_title, + ) def _parse_content(self, content: str) -> str: """Parse the content of an RSS feed item""" diff --git a/packages/markitdown/src/markitdown/converters/_transcribe_audio.py b/packages/markitdown/src/markitdown/converters/_transcribe_audio.py new file mode 100644 index 0000000..3d02173 --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_transcribe_audio.py @@ -0,0 +1,43 @@ +import io +import sys +from typing import BinaryIO +from .._exceptions import MissingDependencyException + +# Try loading optional (but in this case, required) dependencies +# Save reporting of any exceptions for later +_dependency_exc_info = None +try: + import speech_recognition as sr + import pydub +except ImportError: + # Preserve the error and stack trace for later + _dependency_exc_info = sys.exc_info() + + +def transcribe_audio(file_stream: BinaryIO, *, audio_format: str = "wav") -> str: + # Check for installed dependencies + if _dependency_exc_info is not None: + raise MissingDependencyException( + "Speech transcription requires installing MarkItdown with the [audio-transcription] optional dependencies. 
E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`" + ) from _dependency_exc_info[ + 1 + ].with_traceback( # type: ignore[union-attr] + _dependency_exc_info[2] + ) + + if audio_format in ["wav", "aiff", "flac"]: + audio_source = file_stream + elif audio_format in ["mp3", "mp4"]: + audio_segment = pydub.AudioSegment.from_file(file_stream, format=audio_format) + + audio_source = io.BytesIO() + audio_segment.export(audio_source, format="wav") + audio_source.seek(0) + else: + raise ValueError(f"Unsupported audio format: {audio_format}") + + recognizer = sr.Recognizer() + with sr.AudioFile(audio_source) as source: + audio = recognizer.record(source) + transcript = recognizer.recognize_google(audio).strip() + return "[No speech detected]" if transcript == "" else transcript diff --git a/packages/markitdown/src/markitdown/converters/_wav_converter.py b/packages/markitdown/src/markitdown/converters/_wav_converter.py deleted file mode 100644 index 3c8d842..0000000 --- a/packages/markitdown/src/markitdown/converters/_wav_converter.py +++ /dev/null @@ -1,72 +0,0 @@ -from typing import Union -from ._base import DocumentConverter, DocumentConverterResult -from ._media_converter import MediaConverter - -# Optional Transcription support -IS_AUDIO_TRANSCRIPTION_CAPABLE = False -try: - import speech_recognition as sr - - IS_AUDIO_TRANSCRIPTION_CAPABLE = True -except ModuleNotFoundError: - pass - - -class WavConverter(MediaConverter): - """ - Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed). 
- """ - - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a WAV - extension = kwargs.get("file_extension", "") - if extension.lower() != ".wav": - return None - - md_content = "" - - # Add metadata - metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) - if metadata: - for f in [ - "Title", - "Artist", - "Author", - "Band", - "Album", - "Genre", - "Track", - "DateTimeOriginal", - "CreateDate", - "Duration", - ]: - if f in metadata: - md_content += f"{f}: {metadata[f]}\n" - - # Transcribe - if IS_AUDIO_TRANSCRIPTION_CAPABLE: - try: - transcript = self._transcribe_audio(local_path) - md_content += "\n\n### Audio Transcript:\n" + ( - "[No speech detected]" if transcript == "" else transcript - ) - except Exception: - md_content += ( - "\n\n### Audio Transcript:\nError. Could not transcribe this audio." 
- ) - - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) - - def _transcribe_audio(self, local_path) -> str: - recognizer = sr.Recognizer() - with sr.AudioFile(local_path) as source: - audio = recognizer.record(source) - return recognizer.recognize_google(audio).strip() diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py index f27fe23..5b054af 100644 --- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py @@ -1,35 +1,63 @@ +import io import re - -from typing import Any, Union +from typing import Any, BinaryIO, Optional from bs4 import BeautifulSoup -from ._base import DocumentConverter, DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo from ._markdownify import _CustomMarkdownify +ACCEPTED_MIME_TYPE_PREFIXES = [ + "text/html", + "application/xhtml", +] + +ACCEPTED_FILE_EXTENSIONS = [ + ".html", + ".htm", +] + class WikipediaConverter(DocumentConverter): """Handle Wikipedia pages separately, focusing only on the main document content.""" - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + """ + Make sure we're dealing with HTML content *from* Wikipedia. 
+ """ + + url = stream_info.url or "" + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url): + # Not a Wikipedia URL + return False + + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + # Not HTML content + return False def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not Wikipedia - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".html", ".htm"]: - return None - url = kwargs.get("url", "") - if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url): - return None - - # Parse the file - soup = None - with open(local_path, "rt", encoding="utf-8") as fh: - soup = BeautifulSoup(fh.read(), "html.parser") + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + # Parse the stream + encoding = "utf-8" if stream_info.charset is None else stream_info.charset + soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) # Remove javascript and style blocks for script in soup(["script", "style"]): @@ -56,6 +84,6 @@ class WikipediaConverter(DocumentConverter): webpage_text = _CustomMarkdownify().convert_soup(soup) return DocumentConverterResult( + markdown=webpage_text, title=main_title, - text_content=webpage_text, ) diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index 56398ca..3d0e1ab 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -1,10 +1,9 @@ import sys - -from typing import Union - -from ._base import DocumentConverter, DocumentConverterResult +from 
typing import BinaryIO, Any from ._html_converter import HtmlConverter +from .._base_converter import DocumentConverter, DocumentConverterResult from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE +from .._stream_info import StreamInfo # Try loading optional (but in this case, required) dependencies # Save reporting of any exceptions for later @@ -22,23 +21,51 @@ try: except ImportError: _xls_dependency_exc_info = sys.exc_info() +ACCEPTED_XLSX_MIME_TYPE_PREFIXES = [ + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" +] +ACCEPTED_XLSX_FILE_EXTENSIONS = [".xlsx"] -class XlsxConverter(HtmlConverter): +ACCEPTED_XLS_MIME_TYPE_PREFIXES = [ + "application/vnd.ms-excel", + "application/excel", +] +ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"] + + +class XlsxConverter(DocumentConverter): """ Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. """ - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) + def __init__(self): + super().__init__() + self._html_converter = HtmlConverter() - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a XLSX - extension = kwargs.get("file_extension", "") - if extension.lower() != ".xlsx": - return None + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + if extension in ACCEPTED_XLSX_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_XLSX_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: # Check the dependencies if _xlsx_dependency_exc_info is not None: 
raise MissingDependencyException( @@ -47,34 +74,58 @@ class XlsxConverter(HtmlConverter): extension=".xlsx", feature="xlsx", ) - ) from _xlsx_dependency_exc_info[1].with_traceback( + ) from _xlsx_dependency_exc_info[ + 1 + ].with_traceback( # type: ignore[union-attr] _xlsx_dependency_exc_info[2] - ) # Restore the original traceback + ) - sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl") + sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl") md_content = "" for s in sheets: md_content += f"## {s}\n" html_content = sheets[s].to_html(index=False) - md_content += self._convert(html_content).text_content.strip() + "\n\n" + md_content += ( + self._html_converter.convert_string(html_content).markdown.strip() + + "\n\n" + ) - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) + return DocumentConverterResult(markdown=md_content.strip()) -class XlsConverter(HtmlConverter): +class XlsConverter(DocumentConverter): """ Converts XLS files to Markdown, with each sheet presented as a separate Markdown table. 
""" - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a XLS - extension = kwargs.get("file_extension", "") - if extension.lower() != ".xls": - return None + def __init__(self): + super().__init__() + self._html_converter = HtmlConverter() + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension in ACCEPTED_XLS_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_XLS_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: # Load the dependencies if _xls_dependency_exc_info is not None: raise MissingDependencyException( @@ -83,18 +134,20 @@ class XlsConverter(HtmlConverter): extension=".xls", feature="xls", ) - ) from _xls_dependency_exc_info[1].with_traceback( + ) from _xls_dependency_exc_info[ + 1 + ].with_traceback( # type: ignore[union-attr] _xls_dependency_exc_info[2] - ) # Restore the original traceback + ) - sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd") + sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd") md_content = "" for s in sheets: md_content += f"## {s}\n" html_content = sheets[s].to_html(index=False) - md_content += self._convert(html_content).text_content.strip() + "\n\n" + md_content += ( + self._html_converter.convert_string(html_content).markdown.strip() + + "\n\n" + ) - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) + return DocumentConverterResult(markdown=md_content.strip()) diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py index 
e61b208..5a158d5 100644 --- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py +++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py @@ -1,14 +1,15 @@ -import re +import sys import json -import urllib.parse import time - -from typing import Any, Union, Dict, List -from urllib.parse import parse_qs, urlparse +import io +import re +from typing import Any, BinaryIO, Optional, Dict, List, Union +from urllib.parse import parse_qs, urlparse, unquote from bs4 import BeautifulSoup -from ._base import DocumentConverter, DocumentConverterResult - +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo +from ._markdownify import _CustomMarkdownify # Optional YouTube transcription support try: @@ -19,53 +20,59 @@ except ModuleNotFoundError: IS_YOUTUBE_TRANSCRIPT_CAPABLE = False +ACCEPTED_MIME_TYPE_PREFIXES = [ + "text/html", + "application/xhtml", +] + +ACCEPTED_FILE_EXTENSIONS = [ + ".html", + ".htm", +] + + class YouTubeConverter(DocumentConverter): """Handle YouTube specially, focusing on the video title, description, and transcript.""" - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + """ + Make sure we're dealing with HTML content *from* YouTube. 
+ """ + url = stream_info.url or "" + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() - def retry_operation(self, operation, retries=3, delay=2): - """Retries the operation if it fails.""" - attempt = 0 - while attempt < retries: - try: - return operation() # Attempt the operation - except Exception as e: - print(f"Attempt {attempt + 1} failed: {e}") - if attempt < retries - 1: - time.sleep(delay) # Wait before retrying - attempt += 1 - # If all attempts fail, raise the last exception - raise Exception(f"Operation failed after {retries} attempts.") - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not YouTube - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".html", ".htm"]: - return None - url = kwargs.get("url", "") - - url = urllib.parse.unquote(url) + url = unquote(url) url = url.replace(r"\?", "?").replace(r"\=", "=") if not url.startswith("https://www.youtube.com/watch?"): - return None + # Not a YouTube URL + return False - # Parse the file with error handling - try: - with open(local_path, "rt", encoding="utf-8") as fh: - soup = BeautifulSoup(fh.read(), "html.parser") - except Exception as e: - print(f"Error reading YouTube page: {e}") - return None + if extension in ACCEPTED_FILE_EXTENSIONS: + return True - if not soup.title or not soup.title.string: - return None + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + # Not HTML content + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + # Parse the stream + encoding = "utf-8" if stream_info.charset is None else stream_info.charset + soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) # Read the meta tags metadata: Dict[str, str] = {"title": soup.title.string} @@ -126,7 +133,7 @@ 
class YouTubeConverter(DocumentConverter): if IS_YOUTUBE_TRANSCRIPT_CAPABLE: transcript_text = "" - parsed_url = urlparse(url) # type: ignore + parsed_url = urlparse(stream_info.url) # type: ignore params = parse_qs(parsed_url.query) # type: ignore if "v" in params and params["v"][0]: video_id = str(params["v"][0]) @@ -135,7 +142,7 @@ class YouTubeConverter(DocumentConverter): "youtube_transcript_languages", ("en",) ) # Retry the transcript fetching operation - transcript = self.retry_operation( + transcript = self._retry_operation( lambda: YouTubeTranscriptApi.get_transcript( video_id, languages=youtube_transcript_languages ), @@ -158,8 +165,8 @@ class YouTubeConverter(DocumentConverter): assert isinstance(title, str) return DocumentConverterResult( + markdown=webpage_text, title=title, - text_content=webpage_text, ) def _get( @@ -188,3 +195,17 @@ class YouTubeConverter(DocumentConverter): if result := self._findKey(v, key): return result return None + + def _retry_operation(self, operation, retries=3, delay=2): + """Retries the operation if it fails.""" + attempt = 0 + while attempt < retries: + try: + return operation() # Attempt the operation + except Exception as e: + print(f"Attempt {attempt + 1} failed: {e}") + if attempt < retries - 1: + time.sleep(delay) # Wait before retrying + attempt += 1 + # If all attempts fail, raise the last exception + raise Exception(f"Operation failed after {retries} attempts.") diff --git a/packages/markitdown/src/markitdown/converters/_zip_converter.py b/packages/markitdown/src/markitdown/converters/_zip_converter.py index e2b5fe6..cb1a7e6 100644 --- a/packages/markitdown/src/markitdown/converters/_zip_converter.py +++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py @@ -1,9 +1,23 @@ -import os +import sys import zipfile -import shutil -from typing import Any, Union +import io +import os -from ._base import DocumentConverter, DocumentConverterResult +from typing import BinaryIO, Any, TYPE_CHECKING + +from 
.._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo +from .._exceptions import UnsupportedFormatException, FileConversionException + +# Break otherwise circular import for type hinting +if TYPE_CHECKING: + from .._markitdown import MarkItDown + +ACCEPTED_MIME_TYPE_PREFIXES = [ + "application/zip", +] + +ACCEPTED_FILE_EXTENSIONS = [".zip"] class ZipConverter(DocumentConverter): @@ -46,99 +60,58 @@ class ZipConverter(DocumentConverter): """ def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + self, + *, + markitdown: "MarkItDown", ): - super().__init__(priority=priority) + super().__init__() + self._markitdown = markitdown + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not a ZIP - extension = kwargs.get("file_extension", "") - if extension.lower() != ".zip": - return None + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + file_path = stream_info.url or stream_info.local_path or stream_info.filename + md_content = f"Content from the zip file `{file_path}`:\n\n" - # Get parent converters list if available - parent_converters = kwargs.get("_parent_converters", []) - if not parent_converters: - return DocumentConverterResult( - title=None, - text_content=f"[ERROR] No converters available to process zip contents from: {local_path}", - ) + with zipfile.ZipFile(file_stream, "r") as zipObj: + for name in zipObj.namelist(): 
+ try: + z_file_stream = io.BytesIO(zipObj.read(name)) + z_file_stream_info = StreamInfo( + extension=os.path.splitext(name)[1], + filename=os.path.basename(name), + ) + result = self._markitdown.convert_stream( + stream=z_file_stream, + stream_info=z_file_stream_info, + ) + if result is not None: + md_content += f"## File: {name}\n\n" + md_content += result.markdown + "\n\n" + except UnsupportedFormatException: + pass + except FileConversionException: + pass - extracted_zip_folder_name = ( - f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}" - ) - extraction_dir = os.path.normpath( - os.path.join(os.path.dirname(local_path), extracted_zip_folder_name) - ) - md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n" - - try: - # Extract the zip file safely - with zipfile.ZipFile(local_path, "r") as zipObj: - # Bail if we discover it's an Office OOXML file - if "[Content_Types].xml" in zipObj.namelist(): - return None - - # Safeguard against path traversal - for member in zipObj.namelist(): - member_path = os.path.normpath(os.path.join(extraction_dir, member)) - if ( - not os.path.commonprefix([extraction_dir, member_path]) - == extraction_dir - ): - raise ValueError( - f"Path traversal detected in zip file: {member}" - ) - - # Extract all files safely - zipObj.extractall(path=extraction_dir) - - # Process each extracted file - for root, dirs, files in os.walk(extraction_dir): - for name in files: - file_path = os.path.join(root, name) - relative_path = os.path.relpath(file_path, extraction_dir) - - # Get file extension - _, file_extension = os.path.splitext(name) - - # Update kwargs for the file - file_kwargs = kwargs.copy() - file_kwargs["file_extension"] = file_extension - file_kwargs["_parent_converters"] = parent_converters - - # Try converting the file using available converters - for converter in parent_converters: - # Skip the zip converter to avoid infinite recursion - if isinstance(converter, ZipConverter): - continue 
- - result = converter.convert(file_path, **file_kwargs) - if result is not None: - md_content += f"\n## File: {relative_path}\n\n" - md_content += result.text_content + "\n\n" - break - - # Clean up extracted files if specified - if kwargs.get("cleanup_extracted", True): - shutil.rmtree(extraction_dir) - - return DocumentConverterResult(title=None, text_content=md_content.strip()) - - except zipfile.BadZipFile: - return DocumentConverterResult( - title=None, - text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}", - ) - except ValueError as ve: - return DocumentConverterResult( - title=None, - text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}", - ) - except Exception as e: - return DocumentConverterResult( - title=None, - text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}", - ) + return DocumentConverterResult(markdown=md_content.strip()) diff --git a/packages/markitdown/tests/test_cli.py b/packages/markitdown/tests/test_cli.py index 1e2b095..7c8afc2 100644 --- a/packages/markitdown/tests/test_cli.py +++ b/packages/markitdown/tests/test_cli.py @@ -7,7 +7,7 @@ from markitdown import __version__ try: from .test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS except ImportError: - from test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS + from test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS # type: ignore @pytest.fixture(scope="session") diff --git a/packages/markitdown/tests/test_files/test.m4a b/packages/markitdown/tests/test_files/test.m4a new file mode 100755 index 0000000..7a3b25f Binary files /dev/null and b/packages/markitdown/tests/test_files/test.m4a differ diff --git a/packages/markitdown/tests/test_files/test.mp3 b/packages/markitdown/tests/test_files/test.mp3 new file mode 100644 index 0000000..b13ff88 Binary files /dev/null and b/packages/markitdown/tests/test_files/test.mp3 differ diff --git a/packages/markitdown/tests/test_files/test.pdf 
b/packages/markitdown/tests/test_files/test.pdf new file mode 100644 index 0000000..e82861e Binary files /dev/null and b/packages/markitdown/tests/test_files/test.pdf differ diff --git a/packages/markitdown/tests/test_files/test.pptx b/packages/markitdown/tests/test_files/test.pptx index e6d16f3..fb66302 100644 Binary files a/packages/markitdown/tests/test_files/test.pptx and b/packages/markitdown/tests/test_files/test.pptx differ diff --git a/packages/markitdown/tests/test_files/test.wav b/packages/markitdown/tests/test_files/test.wav new file mode 100644 index 0000000..e872f10 Binary files /dev/null and b/packages/markitdown/tests/test_files/test.wav differ diff --git a/packages/markitdown/tests/test_files/test_notebook.ipynb b/packages/markitdown/tests/test_files/test_notebook.ipynb index 62db0fa..28a546f 100644 --- a/packages/markitdown/tests/test_files/test_notebook.ipynb +++ b/packages/markitdown/tests/test_files/test_notebook.ipynb @@ -1,89 +1,89 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "0f61db80", - "metadata": {}, - "source": [ - "# Test Notebook" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "3f2a5bbd", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "markitdown\n" - ] - } - ], - "source": [ - "print('markitdown')" - ] - }, - { - "cell_type": "markdown", - "id": "9b9c0468", - "metadata": {}, - "source": [ - "## Code Cell Below" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "37d8088a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "42\n" - ] - } - ], - "source": [ - "# comment in code\n", - "print(42)" - ] - }, - { - "cell_type": "markdown", - "id": "2e3177bd", - "metadata": {}, - "source": [ - "End\n", - "\n", - "---" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": 
"ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.8" - }, - "title": "Test Notebook Title" - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "markdown", + "id": "0f61db80", + "metadata": {}, + "source": [ + "# Test Notebook" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "3f2a5bbd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "markitdown\n" + ] + } + ], + "source": [ + "print(\"markitdown\")" + ] + }, + { + "cell_type": "markdown", + "id": "9b9c0468", + "metadata": {}, + "source": [ + "## Code Cell Below" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "37d8088a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "42\n" + ] + } + ], + "source": [ + "# comment in code\n", + "print(42)" + ] + }, + { + "cell_type": "markdown", + "id": "2e3177bd", + "metadata": {}, + "source": [ + "End\n", + "\n", + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" + }, + "title": "Test Notebook Title" + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/packages/markitdown/tests/test_markitdown.py b/packages/markitdown/tests/test_markitdown.py index 0a3b56e..8c34da0 100644 --- a/packages/markitdown/tests/test_markitdown.py +++ b/packages/markitdown/tests/test_markitdown.py @@ -2,13 +2,20 @@ import io import os import shutil +import openai import pytest import requests -from warnings import catch_warnings, resetwarnings +import warnings -from 
markitdown import MarkItDown, UnsupportedFormatException, FileConversionException +from markitdown import ( + MarkItDown, + UnsupportedFormatException, + FileConversionException, + StreamInfo, +) +from markitdown._stream_info import _guess_stream_info_from_stream skip_remote = ( True if os.environ.get("GITHUB_ACTIONS") else False @@ -35,6 +42,13 @@ JPG_TEST_EXIFTOOL = { "DateTimeOriginal": "2024:03:14 22:10:00", } +MP3_TEST_EXIFTOOL = { + "Title": "f67a499e-a7d0-4ca3-a49b-358bd934ae3e", + "Artist": "Artist Name Test String", + "Album": "Album Name Test String", + "SampleRate": "48000", +} + PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf" PDF_TEST_STRINGS = [ "While there is contemporaneous exploration of multi-agent approaches" @@ -162,6 +176,107 @@ def validate_strings(result, expected_strings, exclude_strings=None): assert string not in text_content +def test_stream_info_operations() -> None: + """Test operations performed on StreamInfo objects.""" + + stream_info_original = StreamInfo( + mimetype="mimetype.1", + extension="extension.1", + charset="charset.1", + filename="filename.1", + local_path="local_path.1", + url="url.1", + ) + + # Check updating all attributes by keyword + keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"] + for keyword in keywords: + updated_stream_info = stream_info_original.copy_and_update( + **{keyword: f"{keyword}.2"} + ) + + # Make sure the targted attribute is updated + assert getattr(updated_stream_info, keyword) == f"{keyword}.2" + + # Make sure the other attributes are unchanged + for k in keywords: + if k != keyword: + assert getattr(stream_info_original, k) == getattr( + updated_stream_info, k + ) + + # Check updating all attributes by passing a new StreamInfo object + keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"] + for keyword in keywords: + updated_stream_info = stream_info_original.copy_and_update( + StreamInfo(**{keyword: f"{keyword}.2"}) + ) + + # 
Make sure the targted attribute is updated + assert getattr(updated_stream_info, keyword) == f"{keyword}.2" + + # Make sure the other attributes are unchanged + for k in keywords: + if k != keyword: + assert getattr(stream_info_original, k) == getattr( + updated_stream_info, k + ) + + # Check mixing and matching + updated_stream_info = stream_info_original.copy_and_update( + StreamInfo(extension="extension.2", filename="filename.2"), + mimetype="mimetype.3", + charset="charset.3", + ) + assert updated_stream_info.extension == "extension.2" + assert updated_stream_info.filename == "filename.2" + assert updated_stream_info.mimetype == "mimetype.3" + assert updated_stream_info.charset == "charset.3" + assert updated_stream_info.local_path == "local_path.1" + assert updated_stream_info.url == "url.1" + + # Check multiple StreamInfo objects + updated_stream_info = stream_info_original.copy_and_update( + StreamInfo(extension="extension.4", filename="filename.5"), + StreamInfo(mimetype="mimetype.6", charset="charset.7"), + ) + assert updated_stream_info.extension == "extension.4" + assert updated_stream_info.filename == "filename.5" + assert updated_stream_info.mimetype == "mimetype.6" + assert updated_stream_info.charset == "charset.7" + assert updated_stream_info.local_path == "local_path.1" + assert updated_stream_info.url == "url.1" + + +def test_stream_info_guesses() -> None: + """Test StreamInfo guesses based on stream content.""" + + test_tuples = [ + ( + os.path.join(TEST_FILES_DIR, "test.xlsx"), + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + ( + os.path.join(TEST_FILES_DIR, "test.docx"), + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ), + ( + os.path.join(TEST_FILES_DIR, "test.pptx"), + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ), + (os.path.join(TEST_FILES_DIR, "test.xls"), "application/vnd.ms-excel"), + ] + + for file_path, expected_mimetype in test_tuples: + 
with open(file_path, "rb") as f: + guesses = _guess_stream_info_from_stream( + f, filename_hint=os.path.basename(file_path) + ) + assert len(guesses) > 0 + assert guesses[0].mimetype == expected_mimetype + assert guesses[0].extension == os.path.splitext(file_path)[1] + + @pytest.mark.skipif( skip_remote, reason="do not run tests that query external urls", @@ -183,7 +298,6 @@ def test_markitdown_remote() -> None: assert test_string in result.text_content # Youtube - # TODO: This test randomly fails for some reason. Haven't been able to repro it yet. Disabling until I can debug the issue result = markitdown.convert(YOUTUBE_TEST_URL) for test_string in YOUTUBE_TEST_STRINGS: assert test_string in result.text_content @@ -192,6 +306,10 @@ def test_markitdown_remote() -> None: def test_markitdown_local() -> None: markitdown = MarkItDown() + # Test PDF processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pdf")) + validate_strings(result, PDF_TEST_STRINGS) + # Test XLSX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) validate_strings(result, XLSX_TEST_STRINGS) @@ -230,10 +348,6 @@ def test_markitdown_local() -> None: ) validate_strings(result, BLOG_TEST_STRINGS) - # Test ZIP file processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip")) - validate_strings(result, XLSX_TEST_STRINGS) - # Test Wikipedia processing result = markitdown.convert( os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL @@ -254,24 +368,135 @@ def test_markitdown_local() -> None: for test_string in RSS_TEST_STRINGS: assert test_string in text_content - ## Test non-UTF-8 encoding - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv")) - validate_strings(result, CSV_CP932_TEST_STRINGS) - # Test MSG (Outlook email) processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg")) validate_strings(result, MSG_TEST_STRINGS) + # Test non-UTF-8 
encoding + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv")) + validate_strings(result, CSV_CP932_TEST_STRINGS) + # Test JSON processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json")) validate_strings(result, JSON_TEST_STRINGS) + # # Test ZIP file processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip")) + validate_strings(result, DOCX_TEST_STRINGS) + validate_strings(result, XLSX_TEST_STRINGS) + validate_strings(result, BLOG_TEST_STRINGS) + + # Test input from a stream + input_data = b"

Test

" + result = markitdown.convert_stream(io.BytesIO(input_data)) + assert "# Test" in result.text_content + # Test input with leading blank characters input_data = b" \n\n\n

Test

" result = markitdown.convert_stream(io.BytesIO(input_data)) assert "# Test" in result.text_content +def test_markitdown_streams() -> None: + markitdown = MarkItDown() + + # Test PDF processing + with open(os.path.join(TEST_FILES_DIR, "test.pdf"), "rb") as f: + result = markitdown.convert(f, file_extension=".pdf") + validate_strings(result, PDF_TEST_STRINGS) + + # Test XLSX processing + with open(os.path.join(TEST_FILES_DIR, "test.xlsx"), "rb") as f: + result = markitdown.convert(f, file_extension=".xlsx") + validate_strings(result, XLSX_TEST_STRINGS) + + # Test XLS processing + with open(os.path.join(TEST_FILES_DIR, "test.xls"), "rb") as f: + result = markitdown.convert(f, file_extension=".xls") + for test_string in XLS_TEST_STRINGS: + text_content = result.text_content.replace("\\", "") + assert test_string in text_content + + # Test DOCX processing + with open(os.path.join(TEST_FILES_DIR, "test.docx"), "rb") as f: + result = markitdown.convert(f, file_extension=".docx") + validate_strings(result, DOCX_TEST_STRINGS) + + # Test DOCX processing, with comments + with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f: + result = markitdown.convert( + f, + file_extension=".docx", + style_map="comment-reference => ", + ) + validate_strings(result, DOCX_COMMENT_TEST_STRINGS) + + # Test DOCX processing, with comments and setting style_map on init + markitdown_with_style_map = MarkItDown(style_map="comment-reference => ") + with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f: + result = markitdown_with_style_map.convert(f, file_extension=".docx") + validate_strings(result, DOCX_COMMENT_TEST_STRINGS) + + # Test PPTX processing + with open(os.path.join(TEST_FILES_DIR, "test.pptx"), "rb") as f: + result = markitdown.convert(f, file_extension=".pptx") + validate_strings(result, PPTX_TEST_STRINGS) + + # Test HTML processing + with open(os.path.join(TEST_FILES_DIR, "test_blog.html"), "rb") as f: + result = markitdown.convert(f, 
file_extension=".html", url=BLOG_TEST_URL) + validate_strings(result, BLOG_TEST_STRINGS) + + # Test Wikipedia processing + with open(os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), "rb") as f: + result = markitdown.convert(f, file_extension=".html", url=WIKIPEDIA_TEST_URL) + text_content = result.text_content.replace("\\", "") + validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES) + + # Test Bing processing + with open(os.path.join(TEST_FILES_DIR, "test_serp.html"), "rb") as f: + result = markitdown.convert(f, file_extension=".html", url=SERP_TEST_URL) + text_content = result.text_content.replace("\\", "") + validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES) + + # Test RSS processing + with open(os.path.join(TEST_FILES_DIR, "test_rss.xml"), "rb") as f: + result = markitdown.convert(f, file_extension=".xml") + text_content = result.text_content.replace("\\", "") + for test_string in RSS_TEST_STRINGS: + assert test_string in text_content + + # Test MSG (Outlook email) processing + with open(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"), "rb") as f: + result = markitdown.convert(f, file_extension=".msg") + validate_strings(result, MSG_TEST_STRINGS) + + # Test JSON processing + with open(os.path.join(TEST_FILES_DIR, "test.json"), "rb") as f: + result = markitdown.convert(f, file_extension=".json") + validate_strings(result, JSON_TEST_STRINGS) + + +@pytest.mark.skipif( + skip_remote, + reason="do not run remotely run speech transcription tests", +) +def test_speech_transcription() -> None: + markitdown = MarkItDown() + + # Test WAV files, MP3 and M4A files + for file_name in ["test.wav", "test.mp3", "test.m4a"]: + result = markitdown.convert(os.path.join(TEST_FILES_DIR, file_name)) + result_lower = result.text_content.lower() + assert ( + ("1" in result_lower or "one" in result_lower) + and ("2" in result_lower or "two" in result_lower) + and ("3" in result_lower or "three" in result_lower) + and ("4" in result_lower or 
"four" in result_lower) + and ("5" in result_lower or "five" in result_lower) + ) + + def test_exceptions() -> None: # Check that an exception is raised when trying to convert an unsupported format markitdown = MarkItDown() @@ -295,17 +520,20 @@ def test_markitdown_exiftool() -> None: # Test the automatic discovery of exiftool throws a warning # and is disabled try: - with catch_warnings(record=True) as w: + warnings.simplefilter("default") + with warnings.catch_warnings(record=True) as w: markitdown = MarkItDown() result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) assert len(w) == 1 assert w[0].category is DeprecationWarning assert result.text_content.strip() == "" finally: - resetwarnings() + warnings.resetwarnings() + + which_exiftool = shutil.which("exiftool") + assert which_exiftool is not None # Test explicitly setting the location of exiftool - which_exiftool = shutil.which("exiftool") markitdown = MarkItDown(exiftool_path=which_exiftool) result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) for key in JPG_TEST_EXIFTOOL: @@ -320,6 +548,12 @@ def test_markitdown_exiftool() -> None: target = f"{key}: {JPG_TEST_EXIFTOOL[key]}" assert target in result.text_content + # Test some other media types + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.mp3")) + for key in MP3_TEST_EXIFTOOL: + target = f"{key}: {MP3_TEST_EXIFTOOL[key]}" + assert target in result.text_content + @pytest.mark.skipif( skip_llm, @@ -330,7 +564,6 @@ def test_markitdown_llm() -> None: markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o") result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg")) - for test_string in LLM_TEST_STRINGS: assert test_string in result.text_content @@ -339,12 +572,24 @@ def test_markitdown_llm() -> None: for test_string in ["red", "circle", "blue", "square"]: assert test_string in result.text_content.lower() + # Images embedded in PPTX files + result = 
markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx")) + # LLM Captions are included + for test_string in LLM_TEST_STRINGS: + assert test_string in result.text_content + # Standard alt text is included + validate_strings(result, PPTX_TEST_STRINGS) + if __name__ == "__main__": """Runs this file's tests from the command line.""" + test_stream_info_operations() + test_stream_info_guesses() test_markitdown_remote() test_markitdown_local() + test_markitdown_streams() + test_speech_transcription() test_exceptions() test_markitdown_exiftool() - # test_markitdown_llm() + test_markitdown_llm() print("All tests passed!")