Update converter API: use streams rather than file paths (#1088)

* Updated the DocumentConverter interface
* Updated all DocumentConverter classes
* Added support for several new audio file formats
* Updated the sample plugin to the new DocumentConverter interface
* Updated the project README with notes about the changes and use-cases
* Updated the DocumentConverter documentation
* Moved priority outside of DocumentConverter, allowing converters to be reprioritized, and keeping the DocumentConverter interface simple

---------

Co-authored-by: Kenny Zhang <kzhang678@gmail.com>
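For plugin and custom-converter maintainers, the shape of the new stream-based interface is sketched below. The signatures follow the `_base_converter.py` added in this commit; the `MyConverter` class name is a hypothetical placeholder.

```python
from typing import Any, BinaryIO

from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo


class MyConverter(DocumentConverter):  # hypothetical placeholder name
    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        # Quick check, based on stream_info.mimetype / extension / url,
        # of whether convert() should be attempted on this stream.
        raise NotImplementedError()

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        # Read from file_stream directly; no temporary files are created.
        raise NotImplementedError()
```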
.gitattributes (3 changes, vendored)
@@ -1 +1,2 @@
-tests/test_files/** linguist-vendored
+packages/markitdown/tests/test_files/** linguist-vendored
+packages/markitdown-sample-plugin/tests/test_files/** linguist-vendored
README.md (17 changes)
@@ -7,9 +7,11 @@
> [!IMPORTANT]
+> Breaking changes between 0.0.1 and 0.0.2:
+> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install markitdown[all]` to have backward-compatible behavior.
+> * The DocumentConverter class interface has changed to read from file-like streams rather than file paths. *No temporary files are created anymore.* If you are the maintainer of a plugin, or of a custom DocumentConverter, you likely need to update your code. Otherwise, if you are only using the MarkItDown class or CLI (as in these examples), you should not need to change anything.
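To make the new stream-based usage concrete, a minimal sketch based on the `convert_stream` method added in this commit (it assumes `markitdown[all]` is installed and a local `example.pdf` exists):

```python
from markitdown import MarkItDown, StreamInfo

md = MarkItDown()

# No temporary file is written; the stream is consumed in place.
with open("example.pdf", "rb") as fh:  # hypothetical input file
    result = md.convert_stream(fh, stream_info=StreamInfo(extension=".pdf"))
    print(result.markdown)
```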

-MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).
-It supports:
+MarkItDown is a lightweight Python utility for converting various files to Markdown for use with LLMs and related text analysis pipelines. To this end, it is most comparable to [textract](https://github.com/deanmalmgren/textract), but with a focus on preserving important document structure and content as Markdown (including: headings, lists, tables, links, etc.) While the output is often reasonably presentable and human-friendly, it is meant to be consumed by text analysis tools -- and may not be the best option for high-fidelity document conversions for human consumption.
+
+At present, MarkItDown supports:

- PDF
- PowerPoint

@@ -23,6 +25,17 @@ It supports:
- YouTube URLs
- ... and more!

+## Why Markdown?
+
+Markdown is extremely close to plain text, with minimal markup or formatting, but still
+provides a way to represent important document structure. Mainstream LLMs, such as
+OpenAI's GPT-4o, natively "_speak_" Markdown, and often incorporate Markdown into their
+responses unprompted. This suggests that they have been trained on vast amounts of
+Markdown-formatted text, and understand it well. As a side benefit, Markdown conventions
+are also highly token-efficient.
+
## Installation

To install MarkItDown, use pip: `pip install markitdown[all]`. Alternatively, you can install it from the source:

```bash
packages/markitdown-sample-plugin/README.md

@@ -10,23 +10,38 @@ This project shows how to create a sample plugin for MarkItDown. The most important parts are as follows:
Next, implement your custom DocumentConverter:

```python
-from typing import Union
-from markitdown import DocumentConverter, DocumentConverterResult
+from typing import BinaryIO, Any
+from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult, StreamInfo


class RtfConverter(DocumentConverter):

-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not an RTF file
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".rtf":
-            return None
-
-        # Implement the conversion logic here ...
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)

-        # Return the result
-        return DocumentConverterResult(
-            title=title,
-            text_content=text_content,
-        )
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+        # Implement logic to check if the file stream is an RTF file
+        # ...
+        raise NotImplementedError()
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        # Implement logic to convert the file stream to Markdown
+        # ...
+        raise NotImplementedError()
```

Next, make sure your package implements and exports the following:

@@ -71,10 +86,10 @@ Once the plugin package is installed, verify that it is available to MarkItDown
markitdown --list-plugins
```

-To use the plugin for a conversion use the `--use-plugins` flag. For example, to convert a PDF:
+To use the plugin for a conversion, use the `--use-plugins` flag. For example, to convert an RTF file:

```bash
-markitdown --use-plugins path-to-file.pdf
+markitdown --use-plugins path-to-file.rtf
```

In Python, plugins can be enabled as follows:

@@ -83,7 +98,7 @@ In Python, plugins can be enabled as follows:
from markitdown import MarkItDown

md = MarkItDown(enable_plugins=True)
-result = md.convert("path-to-file.pdf")
+result = md.convert("path-to-file.rtf")
print(result.text_content)
```
packages/markitdown-sample-plugin/pyproject.toml

@@ -24,7 +24,7 @@ classifiers = [
  "Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = [
-  "markitdown",
+  "markitdown>=0.0.2a2",
  "striprtf",
]
packages/markitdown-sample-plugin/src/markitdown_sample_plugin/__about__.py

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
-__version__ = "0.0.1a2"
+__version__ = "0.0.1a3"
packages/markitdown-sample-plugin/src/markitdown_sample_plugin/_plugin.py

@@ -1,12 +1,26 @@
-from typing import Union
+import locale
+from typing import BinaryIO, Any
from striprtf.striprtf import rtf_to_text

-from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult
+from markitdown import (
+    MarkItDown,
+    DocumentConverter,
+    DocumentConverterResult,
+    StreamInfo,
+)


__plugin_interface_version__ = (
    1  # The version of the plugin interface that this plugin uses
)

+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "text/rtf",
+    "application/rtf",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".rtf"]
+

def register_converters(markitdown: MarkItDown, **kwargs):
    """

@@ -22,18 +36,41 @@ class RtfConverter(DocumentConverter):
    Converts an RTF file in the simplest possible way.
    """

-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not an RTF
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".rtf":
-            return None
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)

-        # Read the RTF file
-        with open(local_path, "r") as f:
-            rtf = f.read()
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        # Read the file stream into a str using the provided charset encoding, or the system default
+        encoding = stream_info.charset or locale.getpreferredencoding()
+        stream_data = file_stream.read().decode(encoding)

        # Return the result
        return DocumentConverterResult(
            title=None,
-            text_content=rtf_to_text(rtf),
+            markdown=rtf_to_text(stream_data),
        )
@@ -2,7 +2,7 @@
import os
import pytest

-from markitdown import MarkItDown
+from markitdown import MarkItDown, StreamInfo
from markitdown_sample_plugin import RtfConverter

TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")

@@ -15,18 +15,22 @@ RTF_TEST_STRINGS = {

def test_converter() -> None:
    """Tests the RTF converter directly."""
-    converter = RtfConverter()
-    result = converter.convert(
-        os.path.join(TEST_FILES_DIR, "test.rtf"), file_extension=".rtf"
-    )
+    with open(os.path.join(TEST_FILES_DIR, "test.rtf"), "rb") as file_stream:
+        converter = RtfConverter()
+        result = converter.convert(
+            file_stream=file_stream,
+            stream_info=StreamInfo(
+                mimetype="text/rtf", extension=".rtf", filename="test.rtf"
+            ),
+        )

-    for test_string in RTF_TEST_STRINGS:
-        assert test_string in result.text_content
+        for test_string in RTF_TEST_STRINGS:
+            assert test_string in result.text_content


def test_markitdown() -> None:
    """Tests that MarkItDown correctly loads the plugin."""
-    md = MarkItDown()
+    md = MarkItDown(enable_plugins=True)
    result = md.convert(os.path.join(TEST_FILES_DIR, "test.rtf"))

    for test_string in RTF_TEST_STRINGS:
packages/markitdown/pyproject.toml

@@ -26,7 +26,7 @@ classifiers = [
dependencies = [
  "beautifulsoup4",
  "requests",
-  "markdownify~=0.14.1",
+  "markdownify",
  "puremagic",
  "pathvalidate",
  "charset-normalizer",

@@ -78,11 +78,14 @@ extra-dependencies = [
]

[tool.hatch.envs.types]
+features = ["all"]
extra-dependencies = [
+  "openai",
  "mypy>=1.0.0",
]

[tool.hatch.envs.types.scripts]
-check = "mypy --install-types --non-interactive {args:src/markitdown tests}"
+check = "mypy --install-types --non-interactive --ignore-missing-imports {args:src/markitdown tests}"

[tool.coverage.run]
source_pkgs = ["markitdown", "tests"]
packages/markitdown/src/markitdown/__about__.py

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
-__version__ = "0.0.2a1"
+__version__ = "0.0.2a2"
packages/markitdown/src/markitdown/__init__.py

@@ -3,7 +3,13 @@
# SPDX-License-Identifier: MIT

from .__about__ import __version__
-from ._markitdown import MarkItDown
+from ._markitdown import (
+    MarkItDown,
+    PRIORITY_SPECIFIC_FILE_FORMAT,
+    PRIORITY_GENERIC_FILE_FORMAT,
+)
+from ._base_converter import DocumentConverterResult, DocumentConverter
+from ._stream_info import StreamInfo
from ._exceptions import (
    MarkItDownException,
    MissingDependencyException,

@@ -11,7 +17,6 @@ from ._exceptions import (
    FileConversionException,
    UnsupportedFormatException,
)
-from .converters import DocumentConverter, DocumentConverterResult

__all__ = [
    "__version__",

@@ -23,4 +28,7 @@ __all__ = [
    "FailedConversionAttempt",
    "FileConversionException",
    "UnsupportedFormatException",
+    "StreamInfo",
+    "PRIORITY_SPECIFIC_FILE_FORMAT",
+    "PRIORITY_GENERIC_FILE_FORMAT",
]
packages/markitdown/src/markitdown/_base_converter.py (new file, 108 lines)

@@ -0,0 +1,108 @@
import os
import tempfile
from warnings import warn
from typing import Any, Union, BinaryIO, Optional, List
from ._stream_info import StreamInfo


class DocumentConverterResult:
    """The result of converting a document to Markdown."""

    def __init__(
        self,
        markdown: str,
        *,
        title: Optional[str] = None,
    ):
        """
        Initialize the DocumentConverterResult.

        The only required parameter is the converted Markdown text.
        The title, and any other metadata that may be added in the future, are optional.

        Parameters:
        - markdown: The converted Markdown text.
        - title: Optional title of the document.
        """
        self.markdown = markdown
        self.title = title

    @property
    def text_content(self) -> str:
        """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""
        return self.markdown

    @text_content.setter
    def text_content(self, markdown: str):
        """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""
        self.markdown = markdown

    def __str__(self) -> str:
        """Return the converted Markdown text."""
        return self.markdown
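
A brief usage sketch of the result type above (the values are illustrative only):

```python
from markitdown import DocumentConverterResult

result = DocumentConverterResult(markdown="# Title\n\nBody text.", title="Title")

print(result.markdown)      # preferred accessor
print(str(result))          # __str__ returns the same Markdown
print(result.text_content)  # soft-deprecated alias, kept for backward compatibility
```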


class DocumentConverter:
    """Abstract superclass of all DocumentConverters."""

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Return a quick determination on whether the converter should attempt converting the document.
        This is primarily based on `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`).
        In cases where the data is retrieved via HTTP, the `stream_info.url` might also be referenced to
        make a determination (e.g., special converters for Wikipedia, YouTube, etc.).
        Finally, it is conceivable that the `stream_info.filename` might be used in cases
        where the filename is well-known (e.g., `Dockerfile`, `Makefile`, etc.)

        NOTE: The method signature is designed to match that of the convert() method. This provides some
        assurance that, if accepts() returns True, the convert() method will also be able to handle the document.

        IMPORTANT: In rare cases (e.g., OutlookMsgConverter), we need to read more from the stream to make a final
        determination. Read operations inevitably advance the position in file_stream. In these cases, the position
        MUST be reset before returning. This is because the convert() method may be called immediately
        after accepts(), and will expect the file_stream to be at the original position.

        E.g.,
            cur_pos = file_stream.tell()  # Save the current position
            data = file_stream.read(100)  # ... peek at the first 100 bytes, etc.
            file_stream.seek(cur_pos)     # Reset the position to the original position

        Parameters:
        - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
        - stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, etc.)
        - kwargs: Additional keyword arguments for the converter.

        Returns:
        - bool: True if the converter can handle the document, False otherwise.
        """
        raise NotImplementedError(
            f"The subclass, {type(self).__name__}, must implement the accepts() method to determine if it can handle the document."
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert a document to Markdown text.

        Parameters:
        - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
        - stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, etc.)
        - kwargs: Additional keyword arguments for the converter.

        Returns:
        - DocumentConverterResult: The result of the conversion, which includes the title and markdown content.

        Raises:
        - FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
        - MissingDependencyException: If the converter requires a dependency that is not installed.
        """
        raise NotImplementedError("Subclasses must implement this method")

packages/markitdown/src/markitdown/_exceptions.py

@@ -68,6 +68,9 @@ class FileConversionException(MarkItDownException):
        else:
            message = f"File conversion failed after {len(attempts)} attempts:\n"
        for attempt in attempts:
-            message += f" - {type(attempt.converter).__name__} threw {attempt.exc_info[0].__name__} with message: {attempt.exc_info[1]}\n"
+            if attempt.exc_info is None:
+                message += f" - {type(attempt.converter).__name__} provided no exception info.\n"
+            else:
+                message += f" - {type(attempt.converter).__name__} threw {attempt.exc_info[0].__name__} with message: {attempt.exc_info[1]}\n"

        super().__init__(message)
packages/markitdown/src/markitdown/_markitdown.py

@@ -6,8 +6,10 @@ import sys
import tempfile
import warnings
import traceback
+import io
+from dataclasses import dataclass
from importlib.metadata import entry_points
-from typing import Any, List, Optional, Union
+from typing import Any, List, Optional, Union, BinaryIO
from pathlib import Path
from urllib.parse import urlparse
from warnings import warn

@@ -16,9 +18,9 @@ from warnings import warn
import puremagic
import requests

+from ._stream_info import StreamInfo, _guess_stream_info_from_stream
+
from .converters import (
-    DocumentConverter,
-    DocumentConverterResult,
    PlainTextConverter,
    HtmlConverter,
    RssConverter,

@@ -32,26 +34,34 @@ from .converters import (
    XlsConverter,
    PptxConverter,
    ImageConverter,
-    WavConverter,
-    Mp3Converter,
+    AudioConverter,
    OutlookMsgConverter,
    ZipConverter,
    DocumentIntelligenceConverter,
)

+from ._base_converter import DocumentConverter, DocumentConverterResult
+
from ._exceptions import (
    FileConversionException,
    UnsupportedFormatException,
    FailedConversionAttempt,
)

# Override mimetype for csv to fix issue on windows
mimetypes.add_type("text/csv", ".csv")

-_plugins: Union[None | List[Any]] = None
+# Lower priority values are tried first.
+PRIORITY_SPECIFIC_FILE_FORMAT = (
+    0.0  # e.g., .docx, .pdf, .xlsx; or specific pages, e.g., Wikipedia
+)
+PRIORITY_GENERIC_FILE_FORMAT = (
+    10.0  # Near catch-all converters for mimetypes like text/*, etc.
+)
+
+_plugins: List[Any] = []

-def _load_plugins() -> Union[None | List[Any]]:
+def _load_plugins() -> List[Any]:
    """Lazy load plugins, exiting early if already loaded."""
    global _plugins

@@ -71,6 +81,14 @@ def _load_plugins() -> List[Any]:
    return _plugins


+@dataclass(kw_only=True, frozen=True)
+class ConverterRegistration:
+    """A registration of a converter with its priority and other metadata."""
+
+    converter: DocumentConverter
+    priority: float
+
+
class MarkItDown:
    """(In preview) An extremely simple text-based document reader, suitable for LLM use.
    This reader will convert common file-types or webpages to Markdown."""
@@ -92,13 +110,13 @@ class MarkItDown:
        self._requests_session = requests_session

        # TODO - remove these (see enable_builtins)
-        self._llm_client = None
-        self._llm_model = None
-        self._exiftool_path = None
-        self._style_map = None
+        self._llm_client: Any = None
+        self._llm_model: Union[str | None] = None
+        self._exiftool_path: Union[str | None] = None
+        self._style_map: Union[str | None] = None

        # Register the converters
-        self._page_converters: List[DocumentConverter] = []
+        self._converters: List[ConverterRegistration] = []

        if (
            enable_builtins is None or enable_builtins

@@ -126,9 +144,15 @@ class MarkItDown:
            # Register converters for successful browsing operations
            # Later registrations are tried first / take higher priority than earlier registrations
            # To this end, the most specific converters should appear below the most generic converters
-            self.register_converter(PlainTextConverter())
-            self.register_converter(ZipConverter())
-            self.register_converter(HtmlConverter())
+            self.register_converter(
+                PlainTextConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
+            )
+            self.register_converter(
+                ZipConverter(markitdown=self), priority=PRIORITY_GENERIC_FILE_FORMAT
+            )
+            self.register_converter(
+                HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
+            )
            self.register_converter(RssConverter())
            self.register_converter(WikipediaConverter())
            self.register_converter(YouTubeConverter())

@@ -137,8 +161,7 @@ class MarkItDown:
            self.register_converter(XlsxConverter())
            self.register_converter(XlsConverter())
            self.register_converter(PptxConverter())
-            self.register_converter(WavConverter())
-            self.register_converter(Mp3Converter())
+            self.register_converter(AudioConverter())
            self.register_converter(ImageConverter())
            self.register_converter(IpynbConverter())
            self.register_converter(PdfConverter())

@@ -174,12 +197,17 @@ class MarkItDown:
            warn("Plugin converters are already enabled.", RuntimeWarning)

    def convert(
-        self, source: Union[str, requests.Response, Path], **kwargs: Any
+        self,
+        source: Union[str, requests.Response, Path, BinaryIO],
+        *,
+        stream_info: Optional[StreamInfo] = None,
+        **kwargs: Any,
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
        """
        Args:
-        - source: can be a string representing a path either as string pathlib path object or url, or a requests.response object
-        - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
+        - source: can be a path (str or Path), a url, a requests.Response object, or a binary stream
+        - stream_info: optional stream info to use for the conversion. If None, infer from source
+        - kwargs: additional arguments to pass to the converter
        """

        # Local path or url
@@ -191,68 +219,120 @@ class MarkItDown:
        ):
            return self.convert_url(source, **kwargs)
        else:
-            return self.convert_local(source, **kwargs)
+            return self.convert_local(source, stream_info=stream_info, **kwargs)
+        # Path object
+        elif isinstance(source, Path):
+            return self.convert_local(source, stream_info=stream_info, **kwargs)
        # Request response
        elif isinstance(source, requests.Response):
            return self.convert_response(source, **kwargs)
-        elif isinstance(source, Path):
-            return self.convert_local(source, **kwargs)
+        # Binary stream
+        elif (
+            hasattr(source, "read")
+            and callable(source.read)
+            and not isinstance(source, io.TextIOBase)
+        ):
+            return self.convert_stream(source, **kwargs)
+        else:
+            raise TypeError(
+                f"Invalid source type: {type(source)}. Expected str, requests.Response, BinaryIO."
+            )

    def convert_local(
-        self, path: Union[str, Path], **kwargs: Any
-    ) -> DocumentConverterResult:  # TODO: deal with kwargs
+        self,
+        path: Union[str, Path],
+        *,
+        stream_info: Optional[StreamInfo] = None,
+        file_extension: Optional[str] = None,  # Deprecated -- use stream_info
+        url: Optional[str] = None,  # Deprecated -- use stream_info
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
        if isinstance(path, Path):
            path = str(path)
-        # Prepare a list of extensions to try (in order of priority)
-        ext = kwargs.get("file_extension")
-        extensions = [ext] if ext is not None else []
-
-        # Get extension alternatives from the path and puremagic
-        base, ext = os.path.splitext(path)
-        self._append_ext(extensions, ext)
-
-        for g in self._guess_ext_magic(path):
-            self._append_ext(extensions, g)
-
-        # Convert
-        return self._convert(path, extensions, **kwargs)
+        # Build a base StreamInfo object from which to start guesses
+        base_stream_info = StreamInfo(
+            local_path=path,
+            extension=os.path.splitext(path)[1],
+            filename=os.path.basename(path),
+        )
+
+        # Extend the base_stream_info with any additional info from the arguments
+        if stream_info is not None:
+            base_stream_info = base_stream_info.copy_and_update(stream_info)
+
+        if file_extension is not None:
+            # Deprecated -- use stream_info
+            base_stream_info = base_stream_info.copy_and_update(
+                extension=file_extension
+            )
+
+        if url is not None:
+            # Deprecated -- use stream_info
+            base_stream_info = base_stream_info.copy_and_update(url=url)
+
+        with open(path, "rb") as fh:
+            # Prepare a list of configurations to try, starting with the base_stream_info
+            guesses: List[StreamInfo] = [base_stream_info]
+            for guess in _guess_stream_info_from_stream(
+                file_stream=fh, filename_hint=path
+            ):
+                guesses.append(base_stream_info.copy_and_update(guess))
+            return self._convert(file_stream=fh, stream_info_guesses=guesses, **kwargs)

-    # TODO what should stream's type be?
    def convert_stream(
-        self, stream: Any, **kwargs: Any
-    ) -> DocumentConverterResult:  # TODO: deal with kwargs
-        # Prepare a list of extensions to try (in order of priority)
-        ext = kwargs.get("file_extension")
-        extensions = [ext] if ext is not None else []
-
-        # Save the file locally to a temporary file. It will be deleted before this method exits
-        handle, temp_path = tempfile.mkstemp()
-        fh = os.fdopen(handle, "wb")
-        result = None
-        try:
-            # Write to the temporary file
-            content = stream.read()
-            if isinstance(content, str):
-                fh.write(content.encode("utf-8"))
-            else:
-                fh.write(content)
-            fh.close()
-
-            # Use puremagic to check for more extension options
-            for g in self._guess_ext_magic(temp_path):
-                self._append_ext(extensions, g)
-
-            # Convert
-            result = self._convert(temp_path, extensions, **kwargs)
-        # Clean up
-        finally:
-            try:
-                fh.close()
-            except Exception:
-                pass
-            os.unlink(temp_path)
-
-        return result
+        self,
+        stream: BinaryIO,
+        *,
+        stream_info: Optional[StreamInfo] = None,
+        file_extension: Optional[str] = None,  # Deprecated -- use stream_info
+        url: Optional[str] = None,  # Deprecated -- use stream_info
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        guesses: List[StreamInfo] = []
+
+        # Do we have anything on which to base a guess?
+        base_guess = None
+        if stream_info is not None or file_extension is not None or url is not None:
+            # Start with a non-Null base guess
+            if stream_info is None:
+                base_guess = StreamInfo()
+            else:
+                base_guess = stream_info
+
+            if file_extension is not None:
+                # Deprecated -- use stream_info
+                assert base_guess is not None  # for mypy
+                base_guess = base_guess.copy_and_update(extension=file_extension)
+
+            if url is not None:
+                # Deprecated -- use stream_info
+                assert base_guess is not None  # for mypy
+                base_guess = base_guess.copy_and_update(url=url)
+
+        # Append the base guess, if it's non-trivial
+        if base_guess is not None:
+            if base_guess.mimetype is not None or base_guess.extension is not None:
+                guesses.append(base_guess)
+        else:
+            # Create a base guess with no information
+            base_guess = StreamInfo()
+
+        # Create a placeholder filename to help with guessing
+        placeholder_filename = None
+        if base_guess.filename is not None:
+            placeholder_filename = base_guess.filename
+        elif base_guess.extension is not None:
+            placeholder_filename = "placeholder" + base_guess.extension
+
+        # Add guesses based on stream content
+        for guess in _guess_stream_info_from_stream(
+            file_stream=stream, filename_hint=placeholder_filename
+        ):
+            guesses.append(base_guess.copy_and_update(guess))
+
+        # Perform the conversion
+        return self._convert(file_stream=stream, stream_info_guesses=guesses, **kwargs)

    def convert_url(
        self, url: str, **kwargs: Any
@@ -263,55 +343,94 @@ class MarkItDown:
        return self.convert_response(response, **kwargs)

    def convert_response(
-        self, response: requests.Response, **kwargs: Any
-    ) -> DocumentConverterResult:  # TODO fix kwargs type
-        # Prepare a list of extensions to try (in order of priority)
-        ext = kwargs.get("file_extension")
-        extensions = [ext] if ext is not None else []
+        self,
+        response: requests.Response,
+        *,
+        stream_info: Optional[StreamInfo] = None,
+        file_extension: Optional[str] = None,  # Deprecated -- use stream_info
+        url: Optional[str] = None,  # Deprecated -- use stream_info
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        # If there is a content-type header, get the mimetype and charset (if present)
+        mimetype: Optional[str] = None
+        charset: Optional[str] = None

-        # Guess from the mimetype
-        content_type = response.headers.get("content-type", "").split(";")[0]
-        self._append_ext(extensions, mimetypes.guess_extension(content_type))
+        if "content-type" in response.headers:
+            parts = response.headers["content-type"].split(";")
+            mimetype = parts.pop(0).strip()
+            for part in parts:
+                if part.strip().startswith("charset="):
+                    _charset = part.split("=")[1].strip()
+                    if len(_charset) > 0:
+                        charset = _charset

-        # Read the content disposition if there is one
-        content_disposition = response.headers.get("content-disposition", "")
-        m = re.search(r"filename=([^;]+)", content_disposition)
-        if m:
-            base, ext = os.path.splitext(m.group(1).strip("\"'"))
-            self._append_ext(extensions, ext)
+        # If there is a content-disposition header, get the filename and possibly the extension
+        filename: Optional[str] = None
+        extension: Optional[str] = None
+        if "content-disposition" in response.headers:
+            m = re.search(r"filename=([^;]+)", response.headers["content-disposition"])
+            if m:
+                filename = m.group(1).strip("\"'")
+                _, _extension = os.path.splitext(filename)
+                if len(_extension) > 0:
+                    extension = _extension

-        # Read the extension from the path
-        base, ext = os.path.splitext(urlparse(response.url).path)
-        self._append_ext(extensions, ext)
+        # If there is still no filename, try to read it from the url
+        if filename is None:
+            parsed_url = urlparse(response.url)
+            _, _extension = os.path.splitext(parsed_url.path)
+            if len(_extension) > 0:  # Looks like this might be a file!
+                filename = os.path.basename(parsed_url.path)
+                extension = _extension

-        # Save the file locally to a temporary file. It will be deleted before this method exits
-        handle, temp_path = tempfile.mkstemp()
-        fh = os.fdopen(handle, "wb")
-        result = None
-        try:
-            # Download the file
-            for chunk in response.iter_content(chunk_size=512):
-                fh.write(chunk)
-            fh.close()
-
-            # Use puremagic to check for more extension options
-            for g in self._guess_ext_magic(temp_path):
-                self._append_ext(extensions, g)
-
-            # Convert
-            result = self._convert(temp_path, extensions, url=response.url, **kwargs)
-        # Clean up
-        finally:
-            try:
-                fh.close()
-            except Exception:
-                pass
-            os.unlink(temp_path)
-
-        return result
+        # Create an initial guess from all this information
+        base_guess = StreamInfo(
+            mimetype=mimetype,
+            charset=charset,
+            filename=filename,
+            extension=extension,
+            url=response.url,
+        )
+
+        # Update with any additional info from the arguments
+        if stream_info is not None:
+            base_guess = base_guess.copy_and_update(stream_info)
+        if file_extension is not None:
+            # Deprecated -- use stream_info
+            base_guess = base_guess.copy_and_update(extension=file_extension)
+        if url is not None:
+            # Deprecated -- use stream_info
+            base_guess = base_guess.copy_and_update(url=url)
+
+        # Add the guess if it's non-trivial
+        guesses: List[StreamInfo] = []
+        if base_guess.mimetype is not None or base_guess.extension is not None:
+            guesses.append(base_guess)
+
+        # Read into BytesIO
+        buffer = io.BytesIO()
+        for chunk in response.iter_content(chunk_size=512):
+            buffer.write(chunk)
+        buffer.seek(0)
+
+        # Create a placeholder filename to help with guessing
+        placeholder_filename = None
+        if base_guess.filename is not None:
+            placeholder_filename = base_guess.filename
+        elif base_guess.extension is not None:
+            placeholder_filename = "placeholder" + base_guess.extension
+
+        # Add guesses based on stream content
+        for guess in _guess_stream_info_from_stream(
+            file_stream=buffer, filename_hint=placeholder_filename
+        ):
+            guesses.append(base_guess.copy_and_update(guess))
+
+        # Convert
+        return self._convert(file_stream=buffer, stream_info_guesses=guesses, **kwargs)

    def _convert(
-        self, local_path: str, extensions: List[Union[str, None]], **kwargs
+        self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs
    ) -> DocumentConverterResult:
        res: Union[None, DocumentConverterResult] = None
@@ -321,19 +440,21 @@ class MarkItDown:
        # Create a copy of the converters list, sorted by priority.
        # We do this with each call to _convert because the priority of converters may change between calls.
        # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
-        sorted_converters = sorted(self._page_converters, key=lambda x: x.priority)
+        sorted_registrations = sorted(self._converters, key=lambda x: x.priority)
+
+        # Remember the initial stream position so that we can return to it
+        cur_pos = file_stream.tell()

-        for ext in extensions + [None]:  # Try last with no extension
-            for converter in sorted_converters:
+        for stream_info in stream_info_guesses + [StreamInfo()]:
+            for converter_registration in sorted_registrations:
+                converter = converter_registration.converter
+                # Sanity check -- make sure the cur_pos is still the same
+                assert (
+                    cur_pos == file_stream.tell()
+                ), "File stream position should NOT change between guess iterations"
                _kwargs = copy.deepcopy(kwargs)

-                # Overwrite file_extension appropriately
-                if ext is None:
-                    if "file_extension" in _kwargs:
-                        del _kwargs["file_extension"]
-                else:
-                    _kwargs.update({"file_extension": ext})
-
                # Copy any additional global options
                if "llm_client" not in _kwargs and self._llm_client is not None:
                    _kwargs["llm_client"] = self._llm_client
@@ -348,17 +469,40 @@ class MarkItDown:
                    _kwargs["exiftool_path"] = self._exiftool_path

                # Add the list of converters for nested processing
-                _kwargs["_parent_converters"] = self._page_converters
+                _kwargs["_parent_converters"] = self._converters

-                # If we hit an error log it and keep trying
+                # Add legacy kwargs
+                if stream_info is not None:
+                    if stream_info.extension is not None:
+                        _kwargs["file_extension"] = stream_info.extension
+
+                    if stream_info.url is not None:
+                        _kwargs["url"] = stream_info.url
+
+                # Check if the converter will accept the file, and if so, try to convert it
+                _accepts = False
                try:
-                    res = converter.convert(local_path, **_kwargs)
-                except Exception:
-                    failed_attempts.append(
-                        FailedConversionAttempt(
-                            converter=converter, exc_info=sys.exc_info()
-                        )
-                    )
+                    _accepts = converter.accepts(file_stream, stream_info, **_kwargs)
+                except NotImplementedError:
+                    pass
+
+                # accepts() should not have changed the file stream position
+                assert (
+                    cur_pos == file_stream.tell()
+                ), f"{type(converter).__name__}.accepts() should NOT change the file_stream position"
+
+                # Attempt the conversion
+                if _accepts:
+                    try:
+                        res = converter.convert(file_stream, stream_info, **_kwargs)
+                    except Exception:
+                        failed_attempts.append(
+                            FailedConversionAttempt(
+                                converter=converter, exc_info=sys.exc_info()
+                            )
+                        )
+                    finally:
+                        file_stream.seek(cur_pos)

                if res is not None:
                    # Normalize the content

@@ -366,8 +510,6 @@ class MarkItDown:
                        [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
                    )
                    res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)

-                    # Todo
                    return res

        # If we got this far without success, report any exceptions
@@ -376,61 +518,9 @@ class MarkItDown:

        # Nothing can handle it!
        raise UnsupportedFormatException(
-            f"Could not convert '{local_path}' to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
+            "Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
        )

-    def _append_ext(self, extensions, ext):
-        """Append a unique non-None, non-empty extension to a list of extensions."""
-        if ext is None:
-            return
-        ext = ext.strip()
-        if ext == "":
-            return
-        if ext in extensions:
-            return
-        extensions.append(ext)
-
-    def _guess_ext_magic(self, path):
-        """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
-        # Use puremagic to guess
-        try:
-            guesses = puremagic.magic_file(path)
-
-            # Fix for: https://github.com/microsoft/markitdown/issues/222
-            # If there are no guesses, then try again after trimming leading ASCII whitespaces.
-            # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
-            # (space, tab, newline, carriage return, vertical tab, form feed).
-            if len(guesses) == 0:
-                with open(path, "rb") as file:
-                    while True:
-                        char = file.read(1)
-                        if not char:  # End of file
-                            break
-                        if not char.isspace():
-                            file.seek(file.tell() - 1)
-                            break
-                    try:
-                        guesses = puremagic.magic_stream(file)
-                    except puremagic.main.PureError:
-                        pass
-
-            extensions = list()
-            for g in guesses:
-                ext = g.extension.strip()
-                if len(ext) > 0:
-                    if not ext.startswith("."):
-                        ext = "." + ext
-                    if ext not in extensions:
-                        extensions.append(ext)
-            return extensions
-        except FileNotFoundError:
-            pass
-        except IsADirectoryError:
-            pass
-        except PermissionError:
-            pass
-        return []
-
    def register_page_converter(self, converter: DocumentConverter) -> None:
        """DEPRECATED: Use register_converter instead."""
        warn(
@@ -439,6 +529,34 @@ class MarkItDown:
        )
        self.register_converter(converter)

-    def register_converter(self, converter: DocumentConverter) -> None:
-        """Register a page text converter."""
-        self._page_converters.insert(0, converter)
+    def register_converter(
+        self,
+        converter: DocumentConverter,
+        *,
+        priority: float = PRIORITY_SPECIFIC_FILE_FORMAT,
+    ) -> None:
+        """
+        Register a DocumentConverter with a given priority.
+
+        Priorities work as follows: By default, most converters get priority
+        PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exceptions
+        are the PlainTextConverter, HtmlConverter, and ZipConverter, which get
+        priority PRIORITY_GENERIC_FILE_FORMAT (== 10), with lower values
+        being tried first (i.e., higher priority).
+
+        Just prior to conversion, the converters are sorted by priority, using
+        a stable sort. This means that converters with the same priority will
+        remain in the same order, with the most recently registered converters
+        appearing first.
+
+        We have tight control over the order of built-in converters, but
+        plugins can register converters in any order. The registration's priority
+        field reasserts some control over the order of converters.
+
+        Plugins can register converters with any priority, to appear before or
+        after the built-ins. For example, a plugin with priority 9 will run
+        before the PlainTextConverter, but after the built-in converters.
+        """
+        self._converters.insert(
+            0, ConverterRegistration(converter=converter, priority=priority)
+        )
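
To illustrate the priority rules described in this docstring, a small, hedged sketch follows (the `TxtShoutConverter` is a hypothetical stand-in, not shipped with MarkItDown):

```python
from typing import Any, BinaryIO

from markitdown import (
    MarkItDown,
    DocumentConverter,
    DocumentConverterResult,
    StreamInfo,
)


class TxtShoutConverter(DocumentConverter):
    """Hypothetical converter that upper-cases .txt files."""

    def accepts(
        self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any
    ) -> bool:
        return (stream_info.extension or "").lower() == ".txt"

    def convert(
        self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any
    ) -> DocumentConverterResult:
        text = file_stream.read().decode(stream_info.charset or "utf-8")
        return DocumentConverterResult(markdown=text.upper())


md = MarkItDown()

# Priority 9 sorts before the PlainTextConverter (priority 10), but after the
# format-specific built-ins (priority 0), so this converter wins for .txt files.
md.register_converter(TxtShoutConverter(), priority=9.0)
```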

packages/markitdown/src/markitdown/_stream_info.py (new file, 122 lines)

@@ -0,0 +1,122 @@
import puremagic
import mimetypes
import os
from dataclasses import dataclass, asdict
from typing import Optional, BinaryIO, List, TypeVar, Type

# Mimetype substitutions table
MIMETYPE_SUBSTITUTIONS = {
    "application/excel": "application/vnd.ms-excel",
    "application/mspowerpoint": "application/vnd.ms-powerpoint",
}


@dataclass(kw_only=True, frozen=True)
class StreamInfo:
    """The StreamInfo class is used to store information about a file stream.
    All fields can be None, and will depend on how the stream was opened.
    """

    mimetype: Optional[str] = None
    extension: Optional[str] = None
    charset: Optional[str] = None
    filename: Optional[
        str
    ] = None  # From local path, url, or Content-Disposition header
    local_path: Optional[str] = None  # If read from disk
    url: Optional[str] = None  # If read from url

    def copy_and_update(self, *args, **kwargs):
        """Copy the StreamInfo object and update it with the given StreamInfo
        instance and/or other keyword arguments."""
        new_info = asdict(self)

        for si in args:
            assert isinstance(si, StreamInfo)
            new_info.update({k: v for k, v in asdict(si).items() if v is not None})

        if len(kwargs) > 0:
            new_info.update(kwargs)

        return StreamInfo(**new_info)
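
A short usage sketch of `copy_and_update` (the values are illustrative only):

```python
from markitdown import StreamInfo

base = StreamInfo(extension=".html", filename="page.html")

# Non-None fields from StreamInfo arguments override the copy; None fields are
# ignored. Keyword arguments are applied directly.
updated = base.copy_and_update(StreamInfo(mimetype="text/html"), charset="utf-8")

print(updated.mimetype, updated.extension, updated.charset)
# text/html .html utf-8
```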


# Behavior subject to change.
# Do not rely on this outside of this module.
def _guess_stream_info_from_stream(
    file_stream: BinaryIO,
    *,
    filename_hint: Optional[str] = None,
) -> List[StreamInfo]:
    """
    Guess StreamInfo properties (mostly mimetype and extension) from a stream.

    Args:
    - file_stream: The stream to guess the StreamInfo from.
    - filename_hint [Optional]: A filename hint to help with the guessing (may be a placeholder, and not actually the file's name)

    Returns a list of StreamInfo objects in order of confidence.
    """
    guesses: List[StreamInfo] = []

    # Add a guess purely based on the filename hint
    if filename_hint:
        try:
            # Requires Python 3.13+
            mimetype, _ = mimetypes.guess_file_type(filename_hint)  # type: ignore
        except AttributeError:
            mimetype, _ = mimetypes.guess_type(filename_hint)

        if mimetype:
            guesses.append(
                StreamInfo(
                    mimetype=mimetype, extension=os.path.splitext(filename_hint)[1]
                )
            )

    def _puremagic(
        file_stream, filename_hint
    ) -> List[puremagic.main.PureMagicWithConfidence]:
        """Wrap guesses to handle exceptions."""
        try:
            return puremagic.magic_stream(file_stream, filename=filename_hint)
        except puremagic.main.PureError:
            return []

    cur_pos = file_stream.tell()
    type_guesses = _puremagic(file_stream, filename_hint=filename_hint)
    if len(type_guesses) == 0:
        # Fix for: https://github.com/microsoft/markitdown/issues/222
        # If there are no guesses, then try again after trimming leading ASCII whitespaces.
        # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
        # (space, tab, newline, carriage return, vertical tab, form feed).

        # Eat all the leading whitespace
        file_stream.seek(cur_pos)
        while True:
            char = file_stream.read(1)
            if not char:  # End of file
                break
            if not char.isspace():
                file_stream.seek(file_stream.tell() - 1)
                break

        # Try again
        type_guesses = _puremagic(file_stream, filename_hint=filename_hint)
    file_stream.seek(cur_pos)

    # Convert and return the guesses
    for guess in type_guesses:
        kwargs: dict[str, str] = {}
        if guess.extension:
            kwargs["extension"] = guess.extension
        if guess.mime_type:
            kwargs["mimetype"] = MIMETYPE_SUBSTITUTIONS.get(
                guess.mime_type, guess.mime_type
            )
        if len(kwargs) > 0:
            # We don't add the filename_hint, because sometimes it's just a placeholder
            # and, in any case, doesn't add new information.
            guesses.append(StreamInfo(**kwargs))

    return guesses
packages/markitdown/src/markitdown/converters/__init__.py

@@ -2,7 +2,6 @@
#
# SPDX-License-Identifier: MIT

-from ._base import DocumentConverter, DocumentConverterResult
from ._plain_text_converter import PlainTextConverter
from ._html_converter import HtmlConverter
from ._rss_converter import RssConverter

@@ -15,15 +14,12 @@ from ._docx_converter import DocxConverter
from ._xlsx_converter import XlsxConverter, XlsConverter
from ._pptx_converter import PptxConverter
from ._image_converter import ImageConverter
-from ._wav_converter import WavConverter
-from ._mp3_converter import Mp3Converter
+from ._audio_converter import AudioConverter
from ._outlook_msg_converter import OutlookMsgConverter
from ._zip_converter import ZipConverter
from ._doc_intel_converter import DocumentIntelligenceConverter

__all__ = [
-    "DocumentConverter",
-    "DocumentConverterResult",
    "PlainTextConverter",
    "HtmlConverter",
    "RssConverter",

@@ -37,8 +33,7 @@ __all__ = [
    "XlsConverter",
    "PptxConverter",
    "ImageConverter",
-    "WavConverter",
-    "Mp3Converter",
+    "AudioConverter",
    "OutlookMsgConverter",
    "ZipConverter",
    "DocumentIntelligenceConverter",
packages/markitdown/src/markitdown/converters/_audio_converter.py (new file, 102 lines)

@@ -0,0 +1,102 @@
import io
from typing import Any, BinaryIO, Optional

from ._exiftool import exiftool_metadata
from ._transcribe_audio import transcribe_audio
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException

ACCEPTED_MIME_TYPE_PREFIXES = [
    "audio/x-wav",
    "audio/mpeg",
    "video/mp4",
]

ACCEPTED_FILE_EXTENSIONS = [
    ".wav",
    ".mp3",
    ".m4a",
    ".mp4",
]


class AudioConverter(DocumentConverter):
    """
    Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        md_content = ""

        # Add metadata
        metadata = exiftool_metadata(
            file_stream, exiftool_path=kwargs.get("exiftool_path")
        )
        if metadata:
            for f in [
                "Title",
                "Artist",
                "Author",
                "Band",
                "Album",
                "Genre",
                "Track",
                "DateTimeOriginal",
                "CreateDate",
                # "Duration",  -- Wrong values when read from memory
                "NumChannels",
                "SampleRate",
                "AvgBytesPerSec",
                "BitsPerSample",
            ]:
                if f in metadata:
                    md_content += f"{f}: {metadata[f]}\n"

        # Figure out the audio format for transcription
        if stream_info.extension == ".wav" or stream_info.mimetype == "audio/x-wav":
            audio_format = "wav"
        elif stream_info.extension == ".mp3" or stream_info.mimetype == "audio/mpeg":
            audio_format = "mp3"
        elif (
            stream_info.extension in [".mp4", ".m4a"]
            or stream_info.mimetype == "video/mp4"
        ):
            audio_format = "mp4"
        else:
            audio_format = None

        # Transcribe
        if audio_format:
            try:
                transcript = transcribe_audio(file_stream, audio_format=audio_format)
                if transcript:
                    md_content += "\n\n### Audio Transcript:\n" + transcript
            except MissingDependencyException:
                pass

        # Return the result
        return DocumentConverterResult(markdown=md_content.strip())
packages/markitdown/src/markitdown/converters/_base.py (file deleted)

@@ -1,63 +0,0 @@
-from typing import Any, Union
-
-
-class DocumentConverterResult:
-    """The result of converting a document to text."""
-
-    def __init__(self, title: Union[str, None] = None, text_content: str = ""):
-        self.title: Union[str, None] = title
-        self.text_content: str = text_content
-
-
-class DocumentConverter:
-    """Abstract superclass of all DocumentConverters."""
-
-    # Lower priority values are tried first.
-    PRIORITY_SPECIFIC_FILE_FORMAT = (
-        0.0  # e.g., .docx, .pdf, .xlsx; or specific pages, e.g., Wikipedia
-    )
-    PRIORITY_GENERIC_FILE_FORMAT = (
-        10.0  # Near catch-all converters for mimetypes like text/*, etc.
-    )
-
-    def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
-        """
-        Initialize the DocumentConverter with a given priority.
-
-        Priorities work as follows: By default, most converters get priority
-        DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
-        is the PlainTextConverter, which gets priority PRIORITY_GENERIC_FILE_FORMAT (== 10),
-        with lower values being tried first (i.e., higher priority).
-
-        Just prior to conversion, the converters are sorted by priority, using
-        a stable sort. This means that converters with the same priority will
-        remain in the same order, with the most recently registered converters
-        appearing first.
-
-        We have tight control over the order of built-in converters, but
-        plugins can register converters in any order. A converter's priority
-        field reasserts some control over the order of converters.
-
-        Plugins can register converters with any priority, to appear before or
-        after the built-ins. For example, a plugin with priority 9 will run
-        before the PlainTextConverter, but after the built-in converters.
-        """
-        self._priority = priority
-
-    def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        raise NotImplementedError("Subclasses must implement this method")
-
-    @property
-    def priority(self) -> float:
-        """Priority of the converter in markitdown's converter list. Lower priority values are tried first."""
-        return self._priority
-
-    @priority.setter
-    def priority(self, value: float):
-        self._priority = value
-
-    @priority.deleter
-    def priority(self):
-        raise AttributeError("Cannot delete the priority attribute")
packages/markitdown/src/markitdown/converters/_bing_serp_converter.py

@@ -1,14 +1,24 @@
# type: ignore
-import base64
-import io
import re
-
-from typing import Union
+import io
+import base64
+from urllib.parse import parse_qs, urlparse
+from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup

-from ._base import DocumentConverter, DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify

+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "text/html",
+    "application/xhtml",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [
+    ".html",
+    ".htm",
+]
+

class BingSerpConverter(DocumentConverter):
    """
@@ -16,28 +26,47 @@ class BingSerpConverter(DocumentConverter):
    NOTE: It is better to use the Bing API
    """

-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        """
+        Make sure we're dealing with HTML content *from* Bing.
+        """
+
+        url = stream_info.url or ""
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()

-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a Bing SERP
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() not in [".html", ".htm"]:
-            return None
-        url = kwargs.get("url", "")
        if not re.search(r"^https://www\.bing\.com/search\?q=", url):
-            return None
+            # Not a Bing SERP URL
+            return False

+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        # Not HTML content
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
        # Parse the query parameters
-        parsed_params = parse_qs(urlparse(url).query)
+        parsed_params = parse_qs(urlparse(stream_info.url).query)
        query = parsed_params.get("q", [""])[0]

-        # Parse the file
-        soup = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            soup = BeautifulSoup(fh.read(), "html.parser")
+        # Parse the stream
+        encoding = "utf-8" if stream_info.charset is None else stream_info.charset
+        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

        # Clean up some formatting
        for tptt in soup.find_all(class_="tptt"):
@@ -81,6 +110,6 @@ class BingSerpConverter(DocumentConverter):
|
||||
)
|
||||
|
||||
return DocumentConverterResult(
|
||||
markdown=webpage_text,
|
||||
title=None if soup.title is None else soup.title.string,
|
||||
text_content=webpage_text,
|
||||
)
|
||||
|
||||
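The new `accepts()` gate keys off both the stream's metadata and the originating URL. A small probe of that behavior; the converter import path and StreamInfo constructor arguments are assumptions based on this diff:

```python
# Probe of BingSerpConverter.accepts(); import paths are assumed.
import io

from markitdown import StreamInfo
from markitdown.converters import BingSerpConverter

serp = BingSerpConverter()
info = StreamInfo(
    mimetype="text/html",
    extension=".html",
    url="https://www.bing.com/search?q=markitdown",
)
print(serp.accepts(io.BytesIO(b"<html></html>"), info))  # True for a Bing SERP URL
```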
@@ -1,9 +1,12 @@
from typing import Any, Union
import re
import sys
import re

from ._base import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException
from typing import BinaryIO, Any, List

from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
@@ -26,17 +29,50 @@ except ImportError:
CONTENT_FORMAT = "markdown"


OFFICE_MIME_TYPE_PREFIXES = [
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.openxmlformats-officedocument.presentationml",
    "application/xhtml",
    "text/html",
]

OTHER_MIME_TYPE_PREFIXES = [
    "application/pdf",
    "application/x-pdf",
    "text/html",
    "image/",
]

OFFICE_FILE_EXTENSIONS = [
    ".docx",
    ".xlsx",
    ".pptx",
    ".html",
    ".htm",
]

OTHER_FILE_EXTENSIONS = [
    ".pdf",
    ".jpeg",
    ".jpg",
    ".png",
    ".bmp",
    ".tiff",
    ".heif",
]


class DocumentIntelligenceConverter(DocumentConverter):
    """Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""

    def __init__(
        self,
        *,
        priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
        endpoint: str,
        api_version: str = "2024-07-31-preview",
    ):
        super().__init__(priority=priority)
        super().__init__()

        # Raise an error if the dependencies are not available.
        # This is different than other converters since this one isn't even instantiated
@@ -44,9 +80,11 @@ class DocumentIntelligenceConverter(DocumentConverter):
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                "DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`"
            ) from _dependency_exc_info[1].with_traceback(
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )  # Restore the original traceback
            )

        self.endpoint = endpoint
        self.api_version = api_version
@@ -55,55 +93,62 @@ class DocumentIntelligenceConverter(DocumentConverter):
            api_version=self.api_version,
            credential=DefaultAzureCredential(),
        )
        self._priority = priority

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in OFFICE_FILE_EXTENSIONS + OTHER_FILE_EXTENSIONS:
            return True

        for prefix in OFFICE_MIME_TYPE_PREFIXES + OTHER_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        return False

    def _analysis_features(self, stream_info: StreamInfo) -> List[str]:
        """
        Helper needed to determine which analysis features to use.
        Certain document analysis features are not available for
        office filetypes (.xlsx, .pptx, .html, .docx).
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in OFFICE_FILE_EXTENSIONS:
            return []

        for prefix in OFFICE_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return []

        return [
            DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
            DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
            DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
        ]

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if extension is not supported by Document Intelligence
        extension = kwargs.get("file_extension", "")
        docintel_extensions = [
            ".pdf",
            ".docx",
            ".xlsx",
            ".pptx",
            ".html",
            ".jpeg",
            ".jpg",
            ".png",
            ".bmp",
            ".tiff",
            ".heif",
        ]
        if extension.lower() not in docintel_extensions:
            return None

        # Get the bytestring for the local path
        with open(local_path, "rb") as f:
            file_bytes = f.read()

        # Certain document analysis features are not available for office filetypes (.xlsx, .pptx, .html, .docx)
        if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]:
            analysis_features = []
        else:
            analysis_features = [
                DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
                DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
            ]

        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Extract the text using Azure Document Intelligence
        poller = self.doc_intel_client.begin_analyze_document(
            model_id="prebuilt-layout",
            body=AnalyzeDocumentRequest(bytes_source=file_bytes),
            features=analysis_features,
            body=AnalyzeDocumentRequest(bytes_source=file_stream.read()),
            features=self._analysis_features(stream_info),
            output_content_format=CONTENT_FORMAT,  # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
        )
        result: AnalyzeResult = poller.result()

        # Remove comments from the markdown content generated by Doc Intelligence and append to markdown string
        markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
        return DocumentConverterResult(
            title=None,
            text_content=markdown_text,
        )
        return DocumentConverterResult(markdown=markdown_text)
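For end users, the converter is typically wired up through the MarkItDown class rather than instantiated directly. A sketch, assuming the constructor exposes a `docintel_endpoint` parameter (the parameter name and endpoint URL are assumptions, not confirmed by this diff):

```python
# Hypothetical wiring of Document Intelligence through MarkItDown.
from markitdown import MarkItDown

md = MarkItDown(docintel_endpoint="https://<resource>.cognitiveservices.azure.com/")
result = md.convert("contract.pdf")  # illustrative file path
print(result.markdown)
```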
@@ -1,13 +1,10 @@
import sys

from typing import Union
from typing import BinaryIO, Any

from ._base import (
    DocumentConverterResult,
)

from ._base import DocumentConverter
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

# Try loading optional (but in this case, required) dependencies
@@ -20,22 +17,46 @@ except ImportError:
    _dependency_exc_info = sys.exc_info()


ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
]

ACCEPTED_FILE_EXTENSIONS = [".docx"]


class DocxConverter(HtmlConverter):
    """
    Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
    """

    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def __init__(self):
        super().__init__()
        self._html_converter = HtmlConverter()

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a DOCX
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".docx":
            return None
    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Check the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
@@ -44,16 +65,13 @@ class DocxConverter(HtmlConverter):
                extension=".docx",
                feature="docx",
            )
            ) from _dependency_exc_info[1].with_traceback(
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )  # Restore the original traceback
            )

        result = None
        with open(local_path, "rb") as docx_file:
            style_map = kwargs.get("style_map", None)

            result = mammoth.convert_to_html(docx_file, style_map=style_map)
            html_content = result.value
            result = self._convert(html_content)

        return result
        style_map = kwargs.get("style_map", None)
        return self._html_converter.convert_string(
            mammoth.convert_to_html(file_stream, style_map=style_map).value
        )
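mammoth accepts a binary file-like object directly, which is what lets the stream-based `convert()` above skip temporary files entirely. A standalone sketch (the `.docx` path is illustrative):

```python
# mammoth.convert_to_html works on any binary file object, not just paths.
import io

import mammoth

with open("report.docx", "rb") as fh:  # illustrative path
    docx_stream = io.BytesIO(fh.read())

html = mammoth.convert_to_html(docx_stream).value
print(html[:80])
```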
44
packages/markitdown/src/markitdown/converters/_exiftool.py
Normal file
@@ -0,0 +1,44 @@
import json
import subprocess
import locale
import sys
import shutil
import os
import warnings
from typing import BinaryIO, Optional, Any


def exiftool_metadata(
    file_stream: BinaryIO, *, exiftool_path: Optional[str] = None
) -> Any:  # Need a better type for json data
    # Check if we have a valid pointer to exiftool
    if not exiftool_path:
        which_exiftool = shutil.which("exiftool")
        if which_exiftool:
            warnings.warn(
                f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown constructor. E.g.,

    md = MarkItDown(exiftool_path="{which_exiftool}")

This warning will be removed in future releases.
""",
                DeprecationWarning,
            )
        # Nothing to do
        return {}

    # Run exiftool
    cur_pos = file_stream.tell()
    try:
        output = subprocess.run(
            [exiftool_path, "-json", "-"],
            input=file_stream.read(),
            capture_output=True,
            text=False,
        ).stdout

        return json.loads(
            output.decode(locale.getpreferredencoding(False)),
        )[0]
    finally:
        file_stream.seek(cur_pos)
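A usage sketch for the helper above; exiftool must actually be installed, and the image path is illustrative. Note that the helper reads the stream but seeks back to the caller's position before returning:

```python
# Calling exiftool_metadata() with an in-memory stream.
import io
import shutil

with open("photo.jpg", "rb") as fh:  # illustrative path
    stream = io.BytesIO(fh.read())

metadata = exiftool_metadata(stream, exiftool_path=shutil.which("exiftool"))
print(metadata.get("MIMEType"))  # key name depends on exiftool's JSON output
```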
@@ -1,37 +1,52 @@
from typing import Any, Union
import io
from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup

from ._base import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify

ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/html",
    "application/xhtml",
]

ACCEPTED_FILE_EXTENSIONS = [
    ".html",
    ".htm",
]


class HtmlConverter(DocumentConverter):
    """Anything with content type text/html"""

    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        return False

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not html
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".html", ".htm"]:
            return None

        result = None
        with open(local_path, "rt", encoding="utf-8") as fh:
            result = self._convert(fh.read())

        return result

    def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
        """Helper function that converts an HTML string."""

        # Parse the string
        soup = BeautifulSoup(html_content, "html.parser")
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Parse the stream
        encoding = "utf-8" if stream_info.charset is None else stream_info.charset
        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
@@ -51,6 +66,25 @@ class HtmlConverter(DocumentConverter):
        webpage_text = webpage_text.strip()

        return DocumentConverterResult(
            markdown=webpage_text,
            title=None if soup.title is None else soup.title.string,
            text_content=webpage_text,
        )

    def convert_string(
        self, html_content: str, *, url: Optional[str] = None, **kwargs
    ) -> DocumentConverterResult:
        """
        Non-standard convenience method to convert a string to markdown.
        Given that many converters produce HTML as intermediate output, this
        allows for easy conversion of HTML to markdown.
        """
        return self.convert(
            file_stream=io.BytesIO(html_content.encode("utf-8")),
            stream_info=StreamInfo(
                mimetype="text/html",
                extension=".html",
                charset="utf-8",
                url=url,
            ),
            **kwargs,
        )
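`convert_string()` in action; other converters in this diff (DocxConverter, PptxConverter) lean on it for their HTML intermediates:

```python
# Round-tripping an HTML string through HtmlConverter.convert_string().
html = "<html><head><title>Hello</title></head><body><h1>Hello</h1></body></html>"

result = HtmlConverter().convert_string(html)
print(result.title)     # "Hello"
print(result.markdown)  # the heading rendered as Markdown
```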
@@ -1,30 +1,53 @@
from typing import Union
from ._base import DocumentConverter, DocumentConverterResult
from ._media_converter import MediaConverter
from typing import BinaryIO, Any, Union
import base64
import mimetypes
from ._exiftool import exiftool_metadata
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo

ACCEPTED_MIME_TYPE_PREFIXES = [
    "image/jpeg",
    "image/png",
]

ACCEPTED_FILE_EXTENSIONS = [".jpg", ".jpeg", ".png"]


class ImageConverter(MediaConverter):
class ImageConverter(DocumentConverter):
    """
    Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
    """

    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not an image
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".jpg", ".jpeg", ".png"]:
            return None
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        md_content = ""

        # Add metadata
        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
        metadata = exiftool_metadata(
            file_stream, exiftool_path=kwargs.get("exiftool_path")
        )

        if metadata:
            for f in [
@@ -42,39 +65,59 @@ class ImageConverter(MediaConverter):
                if f in metadata:
                    md_content += f"{f}: {metadata[f]}\n"

        # Try describing the image with GPTV
        # Try describing the image with GPT
        llm_client = kwargs.get("llm_client")
        llm_model = kwargs.get("llm_model")
        if llm_client is not None and llm_model is not None:
            md_content += (
                "\n# Description:\n"
                + self._get_llm_description(
                    local_path,
                    extension,
                    llm_client,
                    llm_model,
                    prompt=kwargs.get("llm_prompt"),
                ).strip()
                + "\n"
            llm_description = self._get_llm_description(
                file_stream,
                stream_info,
                client=llm_client,
                model=llm_model,
                prompt=kwargs.get("llm_prompt"),
            )

            if llm_description is not None:
                md_content += "\n# Description:\n" + llm_description.strip() + "\n"

        return DocumentConverterResult(
            title=None,
            text_content=md_content,
            markdown=md_content,
        )

    def _get_llm_description(self, local_path, extension, client, model, prompt=None):
    def _get_llm_description(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        *,
        client,
        model,
        prompt=None,
    ) -> Union[None, str]:
        if prompt is None or prompt.strip() == "":
            prompt = "Write a detailed caption for this image."

        data_uri = ""
        with open(local_path, "rb") as image_file:
            content_type, encoding = mimetypes.guess_type("_dummy" + extension)
            if content_type is None:
                content_type = "image/jpeg"
            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
            data_uri = f"data:{content_type};base64,{image_base64}"
        # Get the content type
        content_type = stream_info.mimetype
        if not content_type:
            content_type, _ = mimetypes.guess_type(
                "_dummy" + (stream_info.extension or "")
            )
        if not content_type:
            content_type = "application/octet-stream"

        # Convert to base64
        cur_pos = file_stream.tell()
        try:
            base64_image = base64.b64encode(file_stream.read()).decode("utf-8")
        except Exception as e:
            return None
        finally:
            file_stream.seek(cur_pos)

        # Prepare the data-uri
        data_uri = f"data:{content_type};base64,{base64_image}"

        # Prepare the OpenAI API request
        messages = [
            {
                "role": "user",
@@ -90,5 +133,6 @@ class ImageConverter(MediaConverter):
            }
        ]

        # Call the OpenAI API
        response = client.chat.completions.create(model=model, messages=messages)
        return response.choices[0].message.content
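The converter reads `llm_client`, `llm_model`, and `llm_prompt` from kwargs, so the usual way to enable image descriptions is through the MarkItDown constructor. A sketch, assuming an OpenAI client (consistent with the `chat.completions` call above, but not the only possibility):

```python
# Hypothetical wiring of an LLM for image descriptions.
from openai import OpenAI

from markitdown import MarkItDown

md = MarkItDown(llm_client=OpenAI(), llm_model="gpt-4o")
result = md.convert("example.jpg")  # illustrative image path
print(result.markdown)
```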
@@ -1,39 +1,62 @@
from typing import BinaryIO, Any
import json
from typing import Any, Union

from ._base import (
    DocumentConverter,
    DocumentConverterResult,
)

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import FileConversionException
from .._stream_info import StreamInfo

CANDIDATE_MIME_TYPE_PREFIXES = [
    "application/json",
]

ACCEPTED_FILE_EXTENSIONS = [".ipynb"]


class IpynbConverter(DocumentConverter):
    """Converts Jupyter Notebook (.ipynb) files to Markdown."""

    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in CANDIDATE_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                # Read further to see if it's a notebook
                cur_pos = file_stream.tell()
                try:
                    encoding = stream_info.charset or "utf-8"
                    notebook_content = file_stream.read().decode(encoding)
                    return (
                        "nbformat" in notebook_content
                        and "nbformat_minor" in notebook_content
                    )
                finally:
                    file_stream.seek(cur_pos)

        return False

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not ipynb
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".ipynb":
            return None

        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Parse and convert the notebook
        result = None
        with open(local_path, "rt", encoding="utf-8") as fh:
            notebook_content = json.load(fh)
            result = self._convert(notebook_content)

        return result
        encoding = stream_info.charset or "utf-8"
        notebook_content = file_stream.read().decode(encoding=encoding)
        return self._convert(json.loads(notebook_content))

    def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
    def _convert(self, notebook_content: dict) -> DocumentConverterResult:
        """Helper function that converts notebook JSON content to Markdown."""
        try:
            md_output = []
@@ -65,8 +88,8 @@ class IpynbConverter(DocumentConverter):
            title = notebook_content.get("metadata", {}).get("title", title)

            return DocumentConverterResult(
                markdown=md_text,
                title=title,
                text_content=md_text,
            )

        except Exception as e:
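The `accepts()` sniff above treats any `application/json` stream that mentions `nbformat` as a notebook, then rewinds the stream. A tiny probe of that behavior; the import paths are assumptions based on this diff:

```python
# Probe of IpynbConverter.accepts(); import paths are assumed.
import io
import json

from markitdown import StreamInfo
from markitdown.converters import IpynbConverter

nb = {"nbformat": 4, "nbformat_minor": 5, "metadata": {}, "cells": []}
stream = io.BytesIO(json.dumps(nb).encode("utf-8"))
info = StreamInfo(mimetype="application/json", extension=".json")
print(IpynbConverter().accepts(stream, info))  # True
```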
@@ -0,0 +1,50 @@
from typing import BinaryIO, Any, Union
import base64
import mimetypes
from .._stream_info import StreamInfo


def llm_caption(
    file_stream: BinaryIO, stream_info: StreamInfo, *, client, model, prompt=None
) -> Union[None, str]:
    if prompt is None or prompt.strip() == "":
        prompt = "Write a detailed caption for this image."

    # Get the content type
    content_type = stream_info.mimetype
    if not content_type:
        content_type, _ = mimetypes.guess_type("_dummy" + (stream_info.extension or ""))
    if not content_type:
        content_type = "application/octet-stream"

    # Convert to base64
    cur_pos = file_stream.tell()
    try:
        base64_image = base64.b64encode(file_stream.read()).decode("utf-8")
    except Exception as e:
        return None
    finally:
        file_stream.seek(cur_pos)

    # Prepare the data-uri
    data_uri = f"data:{content_type};base64,{base64_image}"

    # Prepare the OpenAI API request
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": data_uri,
                    },
                },
            ],
        }
    ]

    # Call the OpenAI API
    response = client.chat.completions.create(model=model, messages=messages)
    return response.choices[0].message.content
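Calling the helper directly, assuming an OpenAI-compatible client object; the image path is illustrative:

```python
# Direct use of llm_caption() with an in-memory image stream.
import io

from openai import OpenAI

with open("chart.png", "rb") as fh:  # illustrative path
    stream = io.BytesIO(fh.read())

caption = llm_caption(
    stream,
    StreamInfo(mimetype="image/png", extension=".png"),
    client=OpenAI(),
    model="gpt-4o",
)
print(caption)
```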
@@ -1,7 +1,7 @@
import re
import markdownify

from typing import Any
from typing import Any, Optional
from urllib.parse import quote, unquote, urlparse, urlunparse


@@ -20,7 +20,14 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
        # Explicitly cast options to the expected type if necessary
        super().__init__(**options)

    def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
    def convert_hn(
        self,
        n: int,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ) -> str:
        """Same as usual, but be sure to start with a new line"""
        if not convert_as_inline:
            if not re.search(r"^\n", text):
@@ -28,7 +35,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):

        return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

    def convert_a(self, el: Any, text: str, convert_as_inline: bool):
    def convert_a(
        self,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ):
        """Same as usual converter, but removes Javascript links and escapes URIs."""
        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
        if not text:
@@ -68,7 +81,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
            else text
        )

    def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
    def convert_img(
        self,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ) -> str:
        """Same as usual converter, but removes data URIs"""

        alt = el.attrs.get("alt", None) or ""
@@ -1,41 +0,0 @@
import subprocess
import shutil
import json
from warnings import warn

from ._base import DocumentConverter


class MediaConverter(DocumentConverter):
    """
    Abstract class for multi-modal media (e.g., images and audio)
    """

    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)

    def _get_metadata(self, local_path, exiftool_path=None):
        if not exiftool_path:
            which_exiftool = shutil.which("exiftool")
            if which_exiftool:
                warn(
                    f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown constructor. E.g.,

    md = MarkItDown(exiftool_path="{which_exiftool}")

This warning will be removed in future releases.
""",
                    DeprecationWarning,
                )

            return None
        else:
            if True:
                result = subprocess.run(
                    [exiftool_path, "-json", local_path], capture_output=True, text=True
                ).stdout
                return json.loads(result)[0]
            # except Exception:
            #     return None
@@ -1,89 +0,0 @@
import tempfile
from typing import Union
from ._base import DocumentConverter, DocumentConverterResult
from ._wav_converter import WavConverter
from warnings import resetwarnings, catch_warnings

# Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
try:
    # Using warnings' catch_warnings to catch
    # pydub's warning of ffmpeg or avconv missing
    with catch_warnings(record=True) as w:
        import pydub

        if w:
            raise ModuleNotFoundError
    import speech_recognition as sr

    IS_AUDIO_TRANSCRIPTION_CAPABLE = True
except ModuleNotFoundError:
    pass
finally:
    resetwarnings()


class Mp3Converter(WavConverter):
    """
    Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
    """

    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not an MP3
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".mp3":
            return None

        md_content = ""

        # Add metadata
        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
        if metadata:
            for f in [
                "Title",
                "Artist",
                "Author",
                "Band",
                "Album",
                "Genre",
                "Track",
                "DateTimeOriginal",
                "CreateDate",
                "Duration",
            ]:
                if f in metadata:
                    md_content += f"{f}: {metadata[f]}\n"

        # Transcribe
        if IS_AUDIO_TRANSCRIPTION_CAPABLE:
            handle, temp_path = tempfile.mkstemp(suffix=".wav")
            os.close(handle)
            try:
                sound = pydub.AudioSegment.from_mp3(local_path)
                sound.export(temp_path, format="wav")

                _args = dict()
                _args.update(kwargs)
                _args["file_extension"] = ".wav"

                try:
                    transcript = super()._transcribe_audio(temp_path).strip()
                    md_content += "\n\n### Audio Transcript:\n" + (
                        "[No speech detected]" if transcript == "" else transcript
                    )
                except Exception:
                    md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."

            finally:
                os.unlink(temp_path)

        # Return the result
        return DocumentConverterResult(
            title=None,
            text_content=md_content.strip(),
        )
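The deleted converter bounced through a temporary `.wav` file on disk. With the stream-based API, pydub can do the same format conversion entirely in memory; a standalone sketch (the `.mp3` path is illustrative):

```python
# In-memory MP3-to-WAV conversion with pydub, no temporary files.
import io

import pydub

sound = pydub.AudioSegment.from_mp3("speech.mp3")  # illustrative path
wav_stream = io.BytesIO()
sound.export(wav_stream, format="wav")
wav_stream.seek(0)  # ready to hand to a stream-based transcriber
```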
@@ -1,6 +1,7 @@
import sys
from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult
from typing import Any, Union, BinaryIO
from .._stream_info import StreamInfo
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

# Try loading optional (but in this case, required) dependencies
@@ -12,6 +13,12 @@ except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()

ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/vnd.ms-outlook",
]

ACCEPTED_FILE_EXTENSIONS = [".msg"]


class OutlookMsgConverter(DocumentConverter):
    """Converts Outlook .msg files to markdown by extracting email metadata and content.
@@ -21,19 +28,52 @@ class OutlookMsgConverter(DocumentConverter):
    - Email body content
    """

    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        # Check the extension and mimetype
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        # Brute force, check if we have an OLE file
        cur_pos = file_stream.tell()
        try:
            if not olefile.isOleFile(file_stream):
                return False
        finally:
            file_stream.seek(cur_pos)

        # Brute force, check if it's an Outlook file
        try:
            msg = olefile.OleFileIO(file_stream)
            toc = "\n".join([str(stream) for stream in msg.listdir()])
            return (
                "__properties_version1.0" in toc
                and "__recip_version1.0_#00000000" in toc
            )
        except Exception as e:
            pass
        finally:
            file_stream.seek(cur_pos)

        return False

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not an MSG file
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".msg":
            return None

        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Check the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
@@ -42,44 +82,41 @@ class OutlookMsgConverter(DocumentConverter):
                extension=".msg",
                feature="outlook",
            )
            ) from _dependency_exc_info[1].with_traceback(
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )  # Restore the original traceback

        try:
            msg = olefile.OleFileIO(local_path)
            # Extract email metadata
            md_content = "# Email Message\n\n"

            # Get headers
            headers = {
                "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"),
                "To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
                "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
            }

            # Add headers to markdown
            for key, value in headers.items():
                if value:
                    md_content += f"**{key}:** {value}\n"

            md_content += "\n## Content\n\n"

            # Get email body
            body = self._get_stream_data(msg, "__substg1.0_1000001F")
            if body:
                md_content += body

            msg.close()

            return DocumentConverterResult(
                title=headers.get("Subject"), text_content=md_content.strip()
            )

        except Exception as e:
            raise FileConversionException(
                f"Could not convert MSG file '{local_path}': {str(e)}"
            )
        msg = olefile.OleFileIO(file_stream)
        # Extract email metadata
        md_content = "# Email Message\n\n"

        # Get headers
        headers = {
            "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"),
            "To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
            "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
        }

        # Add headers to markdown
        for key, value in headers.items():
            if value:
                md_content += f"**{key}:** {value}\n"

        md_content += "\n## Content\n\n"

        # Get email body
        body = self._get_stream_data(msg, "__substg1.0_1000001F")
        if body:
            md_content += body

        msg.close()

        return DocumentConverterResult(
            markdown=md_content.strip(),
            title=headers.get("Subject"),
        )

    def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
        """Helper to safely extract and decode stream data from the MSG file."""
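A quick sanity check of the OLE sniff used by `accepts()`; olefile takes file-like objects as well as paths, and arbitrary bytes are not an OLE compound file, so this reports False:

```python
# olefile.isOleFile works on any binary stream.
import io

import olefile

print(olefile.isOleFile(io.BytesIO(b"not an OLE compound file")))  # False
```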
@@ -1,8 +1,15 @@
import sys
from typing import Union
from ._base import DocumentConverter, DocumentConverterResult
import io

from typing import BinaryIO, Any


from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE


# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
@@ -14,22 +21,43 @@ except ImportError:
    _dependency_exc_info = sys.exc_info()


ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/pdf",
    "application/x-pdf",
]

ACCEPTED_FILE_EXTENSIONS = [".pdf"]


class PdfConverter(DocumentConverter):
    """
    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
    """

    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a PDF
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".pdf":
            return None
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Check the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
@@ -38,11 +66,13 @@ class PdfConverter(DocumentConverter):
                extension=".pdf",
                feature="pdf",
            )
            ) from _dependency_exc_info[1].with_traceback(
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )  # Restore the original traceback
            )

        assert isinstance(file_stream, io.IOBase)  # for mypy
        return DocumentConverterResult(
            title=None,
            text_content=pdfminer.high_level.extract_text(local_path),
            markdown=pdfminer.high_level.extract_text(file_stream),
        )
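pdfminer's high-level API accepts a binary file object as well as a path, which is all the stream-based `convert()` above relies on. A standalone sketch (the `.pdf` path is illustrative):

```python
# pdfminer.high_level.extract_text works on file objects directly.
import io

import pdfminer.high_level

with open("paper.pdf", "rb") as fh:  # illustrative path
    text = pdfminer.high_level.extract_text(io.BytesIO(fh.read()))
print(text[:200])
```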
@@ -1,13 +1,26 @@
import mimetypes
import sys

from charset_normalizer import from_path
from typing import Any, Union
from typing import BinaryIO, Any
from charset_normalizer import from_bytes
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo

from ._base import DocumentConverter, DocumentConverterResult
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
    import mammoth
except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()

ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/",
    "application/json",
]

# Mimetypes to ignore (commonly confused extensions)
IGNORE_MIMETYPES = [
IGNORE_MIME_TYPE_PREFIXES = [
    "text/vnd.in3d.spot",  # .spo which is confused with xls, doc, etc.
    "text/vnd.graphviz",  # .dot which is confused with xls, doc, etc.
]
@@ -16,34 +29,34 @@ IGNORE_MIMETYPES = [
class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""

    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        for prefix in IGNORE_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return False

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        return False

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Guess the content type from any file extension that might be around
        content_type, _ = mimetypes.guess_type(
            "__placeholder" + kwargs.get("file_extension", "")
        )
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        if stream_info.charset:
            text_content = file_stream.read().decode(stream_info.charset)
        else:
            text_content = str(from_bytes(file_stream.read()).best())

        # Ignore common false positives
        if content_type in IGNORE_MIMETYPES:
            content_type = None

        # Only accept text files
        if content_type is None:
            return None
        elif all(
            not content_type.lower().startswith(type_prefix)
            for type_prefix in ["text/", "application/json"]
        ):
            return None

        text_content = str(from_path(local_path).best())
        return DocumentConverterResult(
            title=None,
            text_content=text_content,
        )
        return DocumentConverterResult(markdown=text_content)
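The charset fallback above leans on charset-normalizer's `from_bytes` when no charset is declared in the StreamInfo. A standalone sketch:

```python
# Detecting and decoding an unknown encoding with charset-normalizer.
from charset_normalizer import from_bytes

raw = "café".encode("latin-1")
best = from_bytes(raw).best()
print(str(best))  # the text decoded via the detected encoding
```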
@@ -1,12 +1,16 @@
import sys
import base64
import os
import io
import re
import html
import sys

from typing import Union
from typing import BinaryIO, Any

from ._base import DocumentConverterResult, DocumentConverter
from ._html_converter import HtmlConverter
from ._llm_caption import llm_caption
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

# Try loading optional (but in this case, required) dependencies
@@ -19,51 +23,46 @@ except ImportError:
    _dependency_exc_info = sys.exc_info()


class PptxConverter(HtmlConverter):
ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/vnd.openxmlformats-officedocument.presentationml",
]

ACCEPTED_FILE_EXTENSIONS = [".pptx"]


class PptxConverter(DocumentConverter):
    """
    Converts PPTX files to Markdown. Supports headings, tables, and images with alt text.
    """

    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def __init__(self):
        super().__init__()
        self._html_converter = HtmlConverter()

    def _get_llm_description(
        self, llm_client, llm_model, image_blob, content_type, prompt=None
    ):
        if prompt is None or prompt.strip() == "":
            prompt = "Write a detailed alt text for this image with less than 50 words."
    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        image_base64 = base64.b64encode(image_blob).decode("utf-8")
        data_uri = f"data:{content_type};base64,{image_base64}"
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": data_uri,
                        },
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        response = llm_client.chat.completions.create(
            model=llm_model, messages=messages
        )
        return response.choices[0].message.content

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a PPTX
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".pptx":
            return None
        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Check the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
@@ -72,11 +71,14 @@ class PptxConverter(HtmlConverter):
                extension=".pptx",
                feature="pptx",
            )
            ) from _dependency_exc_info[1].with_traceback(
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )  # Restore the original traceback
            )

        presentation = pptx.Presentation(local_path)
        # Perform the conversion
        presentation = pptx.Presentation(file_stream)
        md_content = ""
        slide_num = 0
        for slide in presentation.slides:
@@ -92,59 +94,58 @@ class PptxConverter(HtmlConverter):
            if self._is_picture(shape):
                # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069

                llm_description = None
                alt_text = None
                llm_description = ""
                alt_text = ""

                # Potentially generate a description using an LLM
                llm_client = kwargs.get("llm_client")
                llm_model = kwargs.get("llm_model")
                if llm_client is not None and llm_model is not None:
                    # Prepare a file_stream and stream_info for the image data
                    image_filename = shape.image.filename
                    image_extension = None
                    if image_filename:
                        image_extension = os.path.splitext(image_filename)[1]
                    image_stream_info = StreamInfo(
                        mimetype=shape.image.content_type,
                        extension=image_extension,
                        filename=image_filename,
                    )

                    image_stream = io.BytesIO(shape.image.blob)

                    # Caption the image
                    try:
                        llm_description = self._get_llm_description(
                            llm_client,
                            llm_model,
                            shape.image.blob,
                            shape.image.content_type,
                        llm_description = llm_caption(
                            image_stream,
                            image_stream_info,
                            client=llm_client,
                            model=llm_model,
                            prompt=kwargs.get("llm_prompt"),
                        )
                    except Exception:
                        # Unable to describe with LLM
                        # Unable to generate a description
                        pass

                if not llm_description:
                    try:
                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
                            "descr", ""
                        )
                    except Exception:
                        # Unable to get alt text
                        pass
                # Also grab any description embedded in the deck
                try:
                    alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
                except Exception:
                    # Unable to get alt text
                    pass

                # Prepare the alt, escaping any special characters
                alt_text = "\n".join([llm_description, alt_text]) or shape.name
                alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
                alt_text = re.sub(r"\s+", " ", alt_text).strip()

                # A placeholder name
                filename = re.sub(r"\W", "", shape.name) + ".jpg"
                md_content += (
                    "\n![" + alt_text + "](" + filename + ")\n"
                )
                md_content += "\n![" + alt_text + "](" + filename + ")\n"

            # Tables
            if self._is_table(shape):
                html_table = "<html><body><table>"
                first_row = True
                for row in shape.table.rows:
                    html_table += "<tr>"
                    for cell in row.cells:
                        if first_row:
                            html_table += "<th>" + html.escape(cell.text) + "</th>"
                        else:
                            html_table += "<td>" + html.escape(cell.text) + "</td>"
                    html_table += "</tr>"
                    first_row = False
                html_table += "</table></body></html>"
                md_content += (
                    "\n" + self._convert(html_table).text_content.strip() + "\n"
                )
                md_content += self._convert_table_to_markdown(shape.table)

            # Charts
            if shape.has_chart:
@@ -174,10 +175,7 @@ class PptxConverter(HtmlConverter):
                    md_content += notes_frame.text
            md_content = md_content.strip()

        return DocumentConverterResult(
            title=None,
            text_content=md_content.strip(),
        )
        return DocumentConverterResult(markdown=md_content.strip())

    def _is_picture(self, shape):
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
@@ -192,6 +190,23 @@ class PptxConverter(HtmlConverter):
            return True
        return False

    def _convert_table_to_markdown(self, table):
        # Write the table as HTML, then convert it to Markdown
        html_table = "<html><body><table>"
        first_row = True
        for row in table.rows:
            html_table += "<tr>"
            for cell in row.cells:
                if first_row:
                    html_table += "<th>" + html.escape(cell.text) + "</th>"
                else:
                    html_table += "<td>" + html.escape(cell.text) + "</td>"
            html_table += "</tr>"
            first_row = False
        html_table += "</table></body></html>"

        return self._html_converter.convert_string(html_table).markdown.strip() + "\n"

    def _convert_chart_to_markdown(self, chart):
        md = "\n\n### Chart"
        if chart.has_title:
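The table handling above round-trips through HTML: the table is serialized as escaped HTML and then fed back through `HtmlConverter.convert_string()`. The same pattern, standalone (the converter import path is an assumption):

```python
# The HTML round-trip behind _convert_table_to_markdown().
from markitdown.converters import HtmlConverter  # import path assumed

table_html = (
    "<html><body><table>"
    "<tr><th>Name</th><th>Qty</th></tr>"
    "<tr><td>Apples</td><td>3</td></tr>"
    "</table></body></html>"
)
print(HtmlConverter().convert_string(table_html).markdown)
```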
@@ -1,128 +1,165 @@
|
||||
from xml.dom import minidom
|
||||
from typing import Union
|
||||
from typing import BinaryIO, Any, Union
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
|
||||
PRECISE_MIME_TYPE_PREFIXES = [
|
||||
"application/rss",
|
||||
"application/atom",
|
||||
]
|
||||
|
||||
PRECISE_FILE_EXTENSIONS = [".rss", ".atom"]
|
||||
|
||||
CANDIDATE_MIME_TYPE_PREFIXES = [
|
||||
"text/xml",
|
||||
"application/xml",
|
||||
]
|
||||
|
||||
CANDIDATE_FILE_EXTENSIONS = [
|
||||
".xml",
|
||||
]
|
||||
|
||||
|
||||
class RssConverter(DocumentConverter):
|
||||
"""Convert RSS / Atom type to markdown"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> bool:
|
||||
mimetype = (stream_info.mimetype or "").lower()
|
||||
extension = (stream_info.extension or "").lower()
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not RSS type
|
||||
extension = kwargs.get("file_extension", "")
|
||||
if extension.lower() not in [".xml", ".rss", ".atom"]:
|
||||
return None
|
||||
# Check for precise mimetypes and file extensions
|
||||
if extension in PRECISE_FILE_EXTENSIONS:
|
||||
return True
|
||||
|
||||
for prefix in PRECISE_MIME_TYPE_PREFIXES:
|
||||
if mimetype.startswith(prefix):
|
||||
return True
|
||||
|
||||
# Check for precise mimetypes and file extensions
|
||||
if extension in CANDIDATE_FILE_EXTENSIONS:
|
||||
return self._check_xml(file_stream)
|
||||
|
||||
for prefix in CANDIDATE_MIME_TYPE_PREFIXES:
|
||||
if mimetype.startswith(prefix):
|
||||
return self._check_xml(file_stream)
|
||||
|
||||
return False
|
||||
|
||||
def _check_xml(self, file_stream: BinaryIO) -> bool:
|
||||
cur_pos = file_stream.tell()
|
||||
try:
|
||||
doc = minidom.parse(local_path)
|
||||
doc = minidom.parse(file_stream)
|
||||
return self._feed_type(doc) is not None
|
||||
except BaseException as _:
|
||||
return None
|
||||
result = None
|
||||
pass
|
||||
finally:
|
||||
file_stream.seek(cur_pos)
|
||||
return False
|
||||
|
||||
def _feed_type(self, doc: Any) -> str:
|
||||
if doc.getElementsByTagName("rss"):
|
||||
# A RSS feed must have a root element of <rss>
|
||||
result = self._parse_rss_type(doc)
|
||||
return "rss"
|
||||
elif doc.getElementsByTagName("feed"):
|
||||
root = doc.getElementsByTagName("feed")[0]
|
||||
if root.getElementsByTagName("entry"):
|
||||
# An Atom feed must have a root element of <feed> and at least one <entry>
|
||||
result = self._parse_atom_type(doc)
|
||||
else:
|
||||
return None
|
||||
return "atom"
|
||||
return None
|
||||
|
||||
def convert(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> DocumentConverterResult:
|
||||
doc = minidom.parse(file_stream)
|
||||
feed_type = self._feed_type(doc)
|
||||
|
||||
if feed_type == "rss":
|
||||
return self._parse_rss_type(doc)
|
||||
elif feed_type == "atom":
|
||||
return self._parse_atom_type(doc)
|
||||
else:
|
||||
# not rss or atom
|
||||
return None
|
||||
raise ValueError("Unknown feed type")
|
||||
|
||||
return result
|
||||
|
||||
def _parse_atom_type(
|
||||
self, doc: minidom.Document
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
def _parse_atom_type(self, doc: minidom.Document) -> DocumentConverterResult:
|
||||
"""Parse the type of an Atom feed.
|
||||
|
||||
Returns None if the feed type is not recognized or something goes wrong.
|
||||
"""
|
||||
try:
|
||||
root = doc.getElementsByTagName("feed")[0]
|
||||
title = self._get_data_by_tag_name(root, "title")
|
||||
subtitle = self._get_data_by_tag_name(root, "subtitle")
|
||||
entries = root.getElementsByTagName("entry")
|
||||
md_text = f"# {title}\n"
|
||||
if subtitle:
|
||||
md_text += f"{subtitle}\n"
|
||||
for entry in entries:
|
||||
entry_title = self._get_data_by_tag_name(entry, "title")
|
||||
entry_summary = self._get_data_by_tag_name(entry, "summary")
|
||||
entry_updated = self._get_data_by_tag_name(entry, "updated")
|
||||
entry_content = self._get_data_by_tag_name(entry, "content")
|
||||
root = doc.getElementsByTagName("feed")[0]
|
||||
title = self._get_data_by_tag_name(root, "title")
|
||||
subtitle = self._get_data_by_tag_name(root, "subtitle")
|
||||
entries = root.getElementsByTagName("entry")
|
||||
md_text = f"# {title}\n"
|
||||
if subtitle:
|
||||
md_text += f"{subtitle}\n"
|
||||
for entry in entries:
|
||||
entry_title = self._get_data_by_tag_name(entry, "title")
|
||||
entry_summary = self._get_data_by_tag_name(entry, "summary")
|
||||
entry_updated = self._get_data_by_tag_name(entry, "updated")
|
||||
entry_content = self._get_data_by_tag_name(entry, "content")
|
||||
|
||||
if entry_title:
|
||||
md_text += f"\n## {entry_title}\n"
|
||||
if entry_updated:
|
||||
md_text += f"Updated on: {entry_updated}\n"
|
||||
if entry_summary:
|
||||
md_text += self._parse_content(entry_summary)
|
||||
if entry_content:
|
||||
md_text += self._parse_content(entry_content)
|
||||
if entry_title:
|
||||
md_text += f"\n## {entry_title}\n"
|
||||
if entry_updated:
|
||||
md_text += f"Updated on: {entry_updated}\n"
|
||||
if entry_summary:
|
||||
md_text += self._parse_content(entry_summary)
|
||||
if entry_content:
|
||||
md_text += self._parse_content(entry_content)
|
||||
|
||||
return DocumentConverterResult(
|
||||
title=title,
|
||||
text_content=md_text,
|
||||
)
|
||||
except BaseException as _:
|
||||
return None
|
||||
return DocumentConverterResult(
|
||||
markdown=md_text,
|
||||
title=title,
|
||||
)
|
||||
|
||||
    def _parse_rss_type(
        self, doc: minidom.Document
    ) -> Union[None, DocumentConverterResult]:
    def _parse_rss_type(self, doc: minidom.Document) -> DocumentConverterResult:
        """Parse the type of an RSS feed.

        Returns None if the feed type is not recognized or something goes wrong.
        """
        try:
            root = doc.getElementsByTagName("rss")[0]
            channel = root.getElementsByTagName("channel")
            if not channel:
                return None
            channel = channel[0]
            channel_title = self._get_data_by_tag_name(channel, "title")
            channel_description = self._get_data_by_tag_name(channel, "description")
            items = channel.getElementsByTagName("item")
            if channel_title:
                md_text = f"# {channel_title}\n"
            if channel_description:
                md_text += f"{channel_description}\n"
            if not items:
                items = []
            for item in items:
                title = self._get_data_by_tag_name(item, "title")
                description = self._get_data_by_tag_name(item, "description")
                pubDate = self._get_data_by_tag_name(item, "pubDate")
                content = self._get_data_by_tag_name(item, "content:encoded")

                if title:
                    md_text += f"\n## {title}\n"
                if pubDate:
                    md_text += f"Published on: {pubDate}\n"
                if description:
                    md_text += self._parse_content(description)
                if content:
                    md_text += self._parse_content(content)

            return DocumentConverterResult(
                title=channel_title,
                text_content=md_text,
            )
        except BaseException as _:
            print(traceback.format_exc())
        return DocumentConverterResult(
            markdown=md_text,
            title=channel_title,
        )

    def _parse_content(self, content: str) -> str:
        """Parse the content of an RSS feed item"""
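Taken together, these hunks mean a feed converter is now driven entirely from memory. A minimal sketch of exercising the new stream-based path (the feed body and `.xml` extension hint are illustrative; passing a `stream_info` hint to `convert_stream` follows the pattern the ZipConverter uses internally, shown later in this change):

```python
import io
from markitdown import MarkItDown, StreamInfo

rss_bytes = b"""<?xml version="1.0"?>
<rss version="2.0"><channel><title>Example Feed</title>
<item><title>Hello</title><description>World</description></item>
</channel></rss>"""

md = MarkItDown()
# No temporary file is written; the converter parses the stream directly.
result = md.convert_stream(
    io.BytesIO(rss_bytes), stream_info=StreamInfo(extension=".xml")
)
print(result.markdown)  # "# Example Feed" followed by "## Hello" ...
```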
@@ -0,0 +1,43 @@
import io
import sys

from typing import BinaryIO

from .._exceptions import MissingDependencyException

# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
    import speech_recognition as sr
    import pydub
except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()


def transcribe_audio(file_stream: BinaryIO, *, audio_format: str = "wav") -> str:
    # Check for installed dependencies
    if _dependency_exc_info is not None:
        raise MissingDependencyException(
            "Speech transcription requires installing MarkItDown with the [audio-transcription] optional dependencies. E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`"
        ) from _dependency_exc_info[
            1
        ].with_traceback(  # type: ignore[union-attr]
            _dependency_exc_info[2]
        )

    if audio_format in ["wav", "aiff", "flac"]:
        audio_source = file_stream
    elif audio_format in ["mp3", "mp4"]:
        audio_segment = pydub.AudioSegment.from_file(file_stream, format=audio_format)

        audio_source = io.BytesIO()
        audio_segment.export(audio_source, format="wav")
        audio_source.seek(0)
    else:
        raise ValueError(f"Unsupported audio format: {audio_format}")

    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_source) as source:
        audio = recognizer.record(source)
        transcript = recognizer.recognize_google(audio).strip()
        return "[No speech detected]" if transcript == "" else transcript
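The helper above can be exercised on its own. A quick sketch (the exact module path of `transcribe_audio` is an assumption for illustration, and the `[audio-transcription]` extras must be installed):

```python
# Illustrative only: the import path below is an assumption, not confirmed by this change.
from markitdown.converters._transcribe_audio import transcribe_audio

with open("speech.mp3", "rb") as f:  # "speech.mp3" is a placeholder path
    # MP3 input is decoded and re-exported to WAV in memory via pydub
    # before being handed to speech_recognition.
    print(transcribe_audio(f, audio_format="mp3"))
```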
@@ -1,72 +0,0 @@
from typing import Union

from ._base import DocumentConverter, DocumentConverterResult
from ._media_converter import MediaConverter

# Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
try:
    import speech_recognition as sr

    IS_AUDIO_TRANSCRIPTION_CAPABLE = True
except ModuleNotFoundError:
    pass


class WavConverter(MediaConverter):
    """
    Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
    """

    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a WAV
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".wav":
            return None

        md_content = ""

        # Add metadata
        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
        if metadata:
            for f in [
                "Title",
                "Artist",
                "Author",
                "Band",
                "Album",
                "Genre",
                "Track",
                "DateTimeOriginal",
                "CreateDate",
                "Duration",
            ]:
                if f in metadata:
                    md_content += f"{f}: {metadata[f]}\n"

        # Transcribe
        if IS_AUDIO_TRANSCRIPTION_CAPABLE:
            try:
                transcript = self._transcribe_audio(local_path)
                md_content += "\n\n### Audio Transcript:\n" + (
                    "[No speech detected]" if transcript == "" else transcript
                )
            except Exception:
                md_content += (
                    "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
                )

        return DocumentConverterResult(
            title=None,
            text_content=md_content.strip(),
        )

    def _transcribe_audio(self, local_path) -> str:
        recognizer = sr.Recognizer()
        with sr.AudioFile(local_path) as source:
            audio = recognizer.record(source)
            return recognizer.recognize_google(audio).strip()
@@ -1,35 +1,63 @@
import io
import re

from typing import Any, Union
from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup

from ._base import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify

ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/html",
    "application/xhtml",
]

ACCEPTED_FILE_EXTENSIONS = [
    ".html",
    ".htm",
]


class WikipediaConverter(DocumentConverter):
    """Handle Wikipedia pages separately, focusing only on the main document content."""

    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Make sure we're dealing with HTML content *from* Wikipedia.
        """

        url = stream_info.url or ""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
            # Not a Wikipedia URL
            return False

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        # Not HTML content
        return False

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not Wikipedia
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".html", ".htm"]:
            return None
        url = kwargs.get("url", "")
        if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
            return None

        # Parse the file
        soup = None
        with open(local_path, "rt", encoding="utf-8") as fh:
            soup = BeautifulSoup(fh.read(), "html.parser")
    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Parse the stream
        encoding = "utf-8" if stream_info.charset is None else stream_info.charset
        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
@@ -56,6 +84,6 @@ class WikipediaConverter(DocumentConverter):
        webpage_text = _CustomMarkdownify().convert_soup(soup)

        return DocumentConverterResult(
            markdown=webpage_text,
            title=main_title,
            text_content=webpage_text,
        )
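The `accepts()`/`convert()` pair above is the general pattern every converter now follows: a cheap, side-effect-free check against `StreamInfo` hints, then a conversion that reads only from the stream. A minimal sketch of the same two-phase protocol for a hypothetical plain-text converter (the class name and body are illustrative, not part of this change):

```python
from typing import Any, BinaryIO

from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo


class PlainTextConverter(DocumentConverter):  # hypothetical example converter
    def accepts(
        self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any
    ) -> bool:
        # Only cheap metadata checks here; avoid reading the stream.
        return (stream_info.extension or "").lower() == ".txt"

    def convert(
        self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any
    ) -> DocumentConverterResult:
        # Honor the charset hint when one was detected upstream.
        charset = stream_info.charset or "utf-8"
        return DocumentConverterResult(markdown=file_stream.read().decode(charset))
```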
@@ -1,10 +1,9 @@
import sys

from typing import Union

from ._base import DocumentConverter, DocumentConverterResult
from typing import BinaryIO, Any
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
from .._stream_info import StreamInfo

# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
@@ -22,23 +21,51 @@ try:
except ImportError:
    _xls_dependency_exc_info = sys.exc_info()

ACCEPTED_XLSX_MIME_TYPE_PREFIXES = [
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
]
ACCEPTED_XLSX_FILE_EXTENSIONS = [".xlsx"]

class XlsxConverter(HtmlConverter):
ACCEPTED_XLS_MIME_TYPE_PREFIXES = [
    "application/vnd.ms-excel",
    "application/excel",
]
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]


class XlsxConverter(DocumentConverter):
    """
    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
    """

    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def __init__(self):
        super().__init__()
        self._html_converter = HtmlConverter()

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a XLSX
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".xlsx":
            return None
    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_XLSX_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_XLSX_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Check the dependencies
        if _xlsx_dependency_exc_info is not None:
            raise MissingDependencyException(
@@ -47,34 +74,58 @@ class XlsxConverter(HtmlConverter):
                extension=".xlsx",
                feature="xlsx",
            )
        ) from _xlsx_dependency_exc_info[1].with_traceback(
        ) from _xlsx_dependency_exc_info[
            1
        ].with_traceback(  # type: ignore[union-attr]
            _xlsx_dependency_exc_info[2]
        )  # Restore the original traceback
        )

        sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
        sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
        md_content = ""
        for s in sheets:
            md_content += f"## {s}\n"
            html_content = sheets[s].to_html(index=False)
            md_content += self._convert(html_content).text_content.strip() + "\n\n"
            md_content += (
                self._html_converter.convert_string(html_content).markdown.strip()
                + "\n\n"
            )

        return DocumentConverterResult(
            title=None,
            text_content=md_content.strip(),
        )
        return DocumentConverterResult(markdown=md_content.strip())


class XlsConverter(HtmlConverter):
class XlsConverter(DocumentConverter):
    """
    Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a XLS
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".xls":
            return None
    def __init__(self):
        super().__init__()
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_XLS_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_XLS_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Load the dependencies
        if _xls_dependency_exc_info is not None:
            raise MissingDependencyException(
@@ -83,18 +134,20 @@ class XlsConverter(HtmlConverter):
                extension=".xls",
                feature="xls",
            )
        ) from _xls_dependency_exc_info[1].with_traceback(
        ) from _xls_dependency_exc_info[
            1
        ].with_traceback(  # type: ignore[union-attr]
            _xls_dependency_exc_info[2]
        )  # Restore the original traceback
        )

        sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
        sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
        md_content = ""
        for s in sheets:
            md_content += f"## {s}\n"
            html_content = sheets[s].to_html(index=False)
            md_content += self._convert(html_content).text_content.strip() + "\n\n"
            md_content += (
                self._html_converter.convert_string(html_content).markdown.strip()
                + "\n\n"
            )

        return DocumentConverterResult(
            title=None,
            text_content=md_content.strip(),
        )
        return DocumentConverterResult(markdown=md_content.strip())
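With `accepts()` now handling routing, callers no longer embed extension guards in `convert()`; they hand MarkItDown a binary stream plus hints. A sketch mirroring this change's own tests (the file path is a placeholder):

```python
from markitdown import MarkItDown

md = MarkItDown()
with open("workbook.xlsx", "rb") as f:  # placeholder path
    # The .xlsx hint lets accepts() route the stream to XlsxConverter.
    result = md.convert(f, file_extension=".xlsx")
print(result.markdown)  # one "## <sheet name>" section per sheet
```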
@@ -1,14 +1,15 @@
import re
import sys
import json
import urllib.parse
import time

from typing import Any, Union, Dict, List
from urllib.parse import parse_qs, urlparse
import io
import re
from typing import Any, BinaryIO, Optional, Dict, List, Union
from urllib.parse import parse_qs, urlparse, unquote
from bs4 import BeautifulSoup

from ._base import DocumentConverter, DocumentConverterResult

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify

# Optional YouTube transcription support
try:
@@ -19,53 +20,59 @@ except ModuleNotFoundError:
    IS_YOUTUBE_TRANSCRIPT_CAPABLE = False


ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/html",
    "application/xhtml",
]

ACCEPTED_FILE_EXTENSIONS = [
    ".html",
    ".htm",
]


class YouTubeConverter(DocumentConverter):
    """Handle YouTube specially, focusing on the video title, description, and transcript."""

    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Make sure we're dealing with HTML content *from* YouTube.
        """
        url = stream_info.url or ""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

    def retry_operation(self, operation, retries=3, delay=2):
        """Retries the operation if it fails."""
        attempt = 0
        while attempt < retries:
            try:
                return operation()  # Attempt the operation
            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt < retries - 1:
                    time.sleep(delay)  # Wait before retrying
                attempt += 1
        # If all attempts fail, raise the last exception
        raise Exception(f"Operation failed after {retries} attempts.")

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not YouTube
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".html", ".htm"]:
            return None
        url = kwargs.get("url", "")

        url = urllib.parse.unquote(url)
        url = unquote(url)
        url = url.replace(r"\?", "?").replace(r"\=", "=")

        if not url.startswith("https://www.youtube.com/watch?"):
            return None
            # Not a YouTube URL
            return False

        # Parse the file with error handling
        try:
            with open(local_path, "rt", encoding="utf-8") as fh:
                soup = BeautifulSoup(fh.read(), "html.parser")
        except Exception as e:
            print(f"Error reading YouTube page: {e}")
            return None
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        if not soup.title or not soup.title.string:
            return None
        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        # Not HTML content
        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Parse the stream
        encoding = "utf-8" if stream_info.charset is None else stream_info.charset
        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

        # Read the meta tags
        metadata: Dict[str, str] = {"title": soup.title.string}
@@ -126,7 +133,7 @@ class YouTubeConverter(DocumentConverter):

        if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
            transcript_text = ""
            parsed_url = urlparse(url)  # type: ignore
            parsed_url = urlparse(stream_info.url)  # type: ignore
            params = parse_qs(parsed_url.query)  # type: ignore
            if "v" in params and params["v"][0]:
                video_id = str(params["v"][0])
@@ -135,7 +142,7 @@ class YouTubeConverter(DocumentConverter):
                    "youtube_transcript_languages", ("en",)
                )
                # Retry the transcript fetching operation
                transcript = self.retry_operation(
                transcript = self._retry_operation(
                    lambda: YouTubeTranscriptApi.get_transcript(
                        video_id, languages=youtube_transcript_languages
                    ),
@@ -158,8 +165,8 @@ class YouTubeConverter(DocumentConverter):
        assert isinstance(title, str)

        return DocumentConverterResult(
            markdown=webpage_text,
            title=title,
            text_content=webpage_text,
        )

    def _get(
@@ -188,3 +195,17 @@ class YouTubeConverter(DocumentConverter):
            if result := self._findKey(v, key):
                return result
        return None

    def _retry_operation(self, operation, retries=3, delay=2):
        """Retries the operation if it fails."""
        attempt = 0
        while attempt < retries:
            try:
                return operation()  # Attempt the operation
            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt < retries - 1:
                    time.sleep(delay)  # Wait before retrying
                attempt += 1
        # If all attempts fail, raise the last exception
        raise Exception(f"Operation failed after {retries} attempts.")
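One behavioral detail worth surfacing from the hunks above: transcript fetching is retried, and callers can steer the transcript language through the `youtube_transcript_languages` option, which travels via `**kwargs`. A sketch (the URL is a placeholder, and the optional YouTube transcript dependency must be installed):

```python
from markitdown import MarkItDown

md = MarkItDown()
# "VIDEO_ID" is a placeholder; any watch URL with a "v" query parameter works.
result = md.convert(
    "https://www.youtube.com/watch?v=VIDEO_ID",
    youtube_transcript_languages=("en", "ja"),  # language codes tried in order
)
print(result.title)
```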
@@ -1,9 +1,23 @@
import os
import sys
import zipfile
import shutil
from typing import Any, Union
import io
import os

from ._base import DocumentConverter, DocumentConverterResult
from typing import BinaryIO, Any, TYPE_CHECKING

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import UnsupportedFormatException, FileConversionException

# Break otherwise circular import for type hinting
if TYPE_CHECKING:
    from .._markitdown import MarkItDown

ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/zip",
]

ACCEPTED_FILE_EXTENSIONS = [".zip"]


class ZipConverter(DocumentConverter):
@@ -46,99 +60,58 @@ class ZipConverter(DocumentConverter):
    """

    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
        self,
        *,
        markitdown: "MarkItDown",
    ):
        super().__init__(priority=priority)
        super().__init__()
        self._markitdown = markitdown

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        return False

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not a ZIP
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".zip":
            return None
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        file_path = stream_info.url or stream_info.local_path or stream_info.filename
        md_content = f"Content from the zip file `{file_path}`:\n\n"

        # Get parent converters list if available
        parent_converters = kwargs.get("_parent_converters", [])
        if not parent_converters:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
            )
        with zipfile.ZipFile(file_stream, "r") as zipObj:
            for name in zipObj.namelist():
                try:
                    z_file_stream = io.BytesIO(zipObj.read(name))
                    z_file_stream_info = StreamInfo(
                        extension=os.path.splitext(name)[1],
                        filename=os.path.basename(name),
                    )
                    result = self._markitdown.convert_stream(
                        stream=z_file_stream,
                        stream_info=z_file_stream_info,
                    )
                    if result is not None:
                        md_content += f"## File: {name}\n\n"
                        md_content += result.markdown + "\n\n"
                except UnsupportedFormatException:
                    pass
                except FileConversionException:
                    pass

        extracted_zip_folder_name = (
            f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
        )
        extraction_dir = os.path.normpath(
            os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
        )
        md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"

        try:
            # Extract the zip file safely
            with zipfile.ZipFile(local_path, "r") as zipObj:
                # Bail if we discover it's an Office OOXML file
                if "[Content_Types].xml" in zipObj.namelist():
                    return None

                # Safeguard against path traversal
                for member in zipObj.namelist():
                    member_path = os.path.normpath(os.path.join(extraction_dir, member))
                    if (
                        not os.path.commonprefix([extraction_dir, member_path])
                        == extraction_dir
                    ):
                        raise ValueError(
                            f"Path traversal detected in zip file: {member}"
                        )

                # Extract all files safely
                zipObj.extractall(path=extraction_dir)

            # Process each extracted file
            for root, dirs, files in os.walk(extraction_dir):
                for name in files:
                    file_path = os.path.join(root, name)
                    relative_path = os.path.relpath(file_path, extraction_dir)

                    # Get file extension
                    _, file_extension = os.path.splitext(name)

                    # Update kwargs for the file
                    file_kwargs = kwargs.copy()
                    file_kwargs["file_extension"] = file_extension
                    file_kwargs["_parent_converters"] = parent_converters

                    # Try converting the file using available converters
                    for converter in parent_converters:
                        # Skip the zip converter to avoid infinite recursion
                        if isinstance(converter, ZipConverter):
                            continue

                        result = converter.convert(file_path, **file_kwargs)
                        if result is not None:
                            md_content += f"\n## File: {relative_path}\n\n"
                            md_content += result.text_content + "\n\n"
                            break

            # Clean up extracted files if specified
            if kwargs.get("cleanup_extracted", True):
                shutil.rmtree(extraction_dir)

            return DocumentConverterResult(title=None, text_content=md_content.strip())

        except zipfile.BadZipFile:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
            )
        except ValueError as ve:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
            )
        except Exception as e:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
            )
        return DocumentConverterResult(markdown=md_content.strip())
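The replacement logic above is a notable design change: each archive member is read into an `io.BytesIO` buffer and routed recursively back through `MarkItDown.convert_stream`, so nothing is extracted to disk and the old extraction-directory and path-traversal handling becomes unnecessary. Driving it looks the same as any other format (a sketch; the path is a placeholder):

```python
from markitdown import MarkItDown

md = MarkItDown()
with open("bundle.zip", "rb") as f:  # placeholder path
    result = md.convert(f, file_extension=".zip")
# Each convertible member appears under its own "## File: <name>" heading;
# unsupported or unconvertible members are silently skipped.
print(result.markdown)
```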
@@ -7,7 +7,7 @@ from markitdown import __version__
try:
    from .test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS
except ImportError:
    from test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS
    from test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS  # type: ignore


@pytest.fixture(scope="session")
BIN packages/markitdown/tests/test_files/test.m4a vendored Executable file (Binary file not shown)
BIN packages/markitdown/tests/test_files/test.mp3 vendored Normal file (Binary file not shown)
BIN packages/markitdown/tests/test_files/test.pdf vendored Normal file (Binary file not shown)
BIN packages/markitdown/tests/test_files/test.pptx vendored (Binary file not shown)
BIN packages/markitdown/tests/test_files/test.wav vendored Normal file (Binary file not shown)
@@ -1,89 +1,89 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0f61db80",
   "metadata": {},
   "source": [
    "# Test Notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "3f2a5bbd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "markitdown\n"
     ]
    }
   ],
   "source": [
    "print('markitdown')"
    "print(\"markitdown\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9b9c0468",
   "metadata": {},
   "source": [
    "## Code Cell Below"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "37d8088a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "42\n"
     ]
    }
   ],
   "source": [
    "# comment in code\n",
    "print(42)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2e3177bd",
   "metadata": {},
   "source": [
    "End\n",
    "\n",
    "---"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.8"
  },
  "title": "Test Notebook Title"
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
@@ -2,13 +2,20 @@
import io
import os
import shutil
import openai

import pytest
import requests

from warnings import catch_warnings, resetwarnings
import warnings

from markitdown import MarkItDown, UnsupportedFormatException, FileConversionException
from markitdown import (
    MarkItDown,
    UnsupportedFormatException,
    FileConversionException,
    StreamInfo,
)
from markitdown._stream_info import _guess_stream_info_from_stream

skip_remote = (
    True if os.environ.get("GITHUB_ACTIONS") else False
@@ -35,6 +42,13 @@ JPG_TEST_EXIFTOOL = {
    "DateTimeOriginal": "2024:03:14 22:10:00",
}

MP3_TEST_EXIFTOOL = {
    "Title": "f67a499e-a7d0-4ca3-a49b-358bd934ae3e",
    "Artist": "Artist Name Test String",
    "Album": "Album Name Test String",
    "SampleRate": "48000",
}

PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf"
PDF_TEST_STRINGS = [
    "While there is contemporaneous exploration of multi-agent approaches"
@@ -162,6 +176,107 @@ def validate_strings(result, expected_strings, exclude_strings=None):
        assert string not in text_content


def test_stream_info_operations() -> None:
    """Test operations performed on StreamInfo objects."""

    stream_info_original = StreamInfo(
        mimetype="mimetype.1",
        extension="extension.1",
        charset="charset.1",
        filename="filename.1",
        local_path="local_path.1",
        url="url.1",
    )

    # Check updating all attributes by keyword
    keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"]
    for keyword in keywords:
        updated_stream_info = stream_info_original.copy_and_update(
            **{keyword: f"{keyword}.2"}
        )

        # Make sure the targeted attribute is updated
        assert getattr(updated_stream_info, keyword) == f"{keyword}.2"

        # Make sure the other attributes are unchanged
        for k in keywords:
            if k != keyword:
                assert getattr(stream_info_original, k) == getattr(
                    updated_stream_info, k
                )

    # Check updating all attributes by passing a new StreamInfo object
    keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"]
    for keyword in keywords:
        updated_stream_info = stream_info_original.copy_and_update(
            StreamInfo(**{keyword: f"{keyword}.2"})
        )

        # Make sure the targeted attribute is updated
        assert getattr(updated_stream_info, keyword) == f"{keyword}.2"

        # Make sure the other attributes are unchanged
        for k in keywords:
            if k != keyword:
                assert getattr(stream_info_original, k) == getattr(
                    updated_stream_info, k
                )

    # Check mixing and matching
    updated_stream_info = stream_info_original.copy_and_update(
        StreamInfo(extension="extension.2", filename="filename.2"),
        mimetype="mimetype.3",
        charset="charset.3",
    )
    assert updated_stream_info.extension == "extension.2"
    assert updated_stream_info.filename == "filename.2"
    assert updated_stream_info.mimetype == "mimetype.3"
    assert updated_stream_info.charset == "charset.3"
    assert updated_stream_info.local_path == "local_path.1"
    assert updated_stream_info.url == "url.1"

    # Check multiple StreamInfo objects
    updated_stream_info = stream_info_original.copy_and_update(
        StreamInfo(extension="extension.4", filename="filename.5"),
        StreamInfo(mimetype="mimetype.6", charset="charset.7"),
    )
    assert updated_stream_info.extension == "extension.4"
    assert updated_stream_info.filename == "filename.5"
    assert updated_stream_info.mimetype == "mimetype.6"
    assert updated_stream_info.charset == "charset.7"
    assert updated_stream_info.local_path == "local_path.1"
    assert updated_stream_info.url == "url.1"


def test_stream_info_guesses() -> None:
    """Test StreamInfo guesses based on stream content."""

    test_tuples = [
        (
            os.path.join(TEST_FILES_DIR, "test.xlsx"),
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ),
        (
            os.path.join(TEST_FILES_DIR, "test.docx"),
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ),
        (
            os.path.join(TEST_FILES_DIR, "test.pptx"),
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ),
        (os.path.join(TEST_FILES_DIR, "test.xls"), "application/vnd.ms-excel"),
    ]

    for file_path, expected_mimetype in test_tuples:
        with open(file_path, "rb") as f:
            guesses = _guess_stream_info_from_stream(
                f, filename_hint=os.path.basename(file_path)
            )
            assert len(guesses) > 0
            assert guesses[0].mimetype == expected_mimetype
            assert guesses[0].extension == os.path.splitext(file_path)[1]


@pytest.mark.skipif(
    skip_remote,
    reason="do not run tests that query external urls",
)
@@ -183,7 +298,6 @@ def test_markitdown_remote() -> None:
        assert test_string in result.text_content

    # Youtube
    # TODO: This test randomly fails for some reason. Haven't been able to repro it yet. Disabling until I can debug the issue
    result = markitdown.convert(YOUTUBE_TEST_URL)
    for test_string in YOUTUBE_TEST_STRINGS:
        assert test_string in result.text_content

@@ -192,6 +306,10 @@
def test_markitdown_local() -> None:
    markitdown = MarkItDown()

    # Test PDF processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pdf"))
    validate_strings(result, PDF_TEST_STRINGS)

    # Test XLSX processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
    validate_strings(result, XLSX_TEST_STRINGS)

@@ -230,10 +348,6 @@
    )
    validate_strings(result, BLOG_TEST_STRINGS)

    # Test ZIP file processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
    validate_strings(result, XLSX_TEST_STRINGS)

    # Test Wikipedia processing
    result = markitdown.convert(
        os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
@@ -254,24 +368,135 @@ def test_markitdown_local() -> None:
    for test_string in RSS_TEST_STRINGS:
        assert test_string in text_content

    ## Test non-UTF-8 encoding
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
    validate_strings(result, CSV_CP932_TEST_STRINGS)

    # Test MSG (Outlook email) processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"))
    validate_strings(result, MSG_TEST_STRINGS)

    # Test non-UTF-8 encoding
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
    validate_strings(result, CSV_CP932_TEST_STRINGS)

    # Test JSON processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json"))
    validate_strings(result, JSON_TEST_STRINGS)

    # Test ZIP file processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
    validate_strings(result, DOCX_TEST_STRINGS)
    validate_strings(result, XLSX_TEST_STRINGS)
    validate_strings(result, BLOG_TEST_STRINGS)

    # Test input from a stream
    input_data = b"<html><body><h1>Test</h1></body></html>"
    result = markitdown.convert_stream(io.BytesIO(input_data))
    assert "# Test" in result.text_content

    # Test input with leading blank characters
    input_data = b" \n\n\n<html><body><h1>Test</h1></body></html>"
    result = markitdown.convert_stream(io.BytesIO(input_data))
    assert "# Test" in result.text_content


def test_markitdown_streams() -> None:
    markitdown = MarkItDown()

    # Test PDF processing
    with open(os.path.join(TEST_FILES_DIR, "test.pdf"), "rb") as f:
        result = markitdown.convert(f, file_extension=".pdf")
    validate_strings(result, PDF_TEST_STRINGS)

    # Test XLSX processing
    with open(os.path.join(TEST_FILES_DIR, "test.xlsx"), "rb") as f:
        result = markitdown.convert(f, file_extension=".xlsx")
    validate_strings(result, XLSX_TEST_STRINGS)

    # Test XLS processing
    with open(os.path.join(TEST_FILES_DIR, "test.xls"), "rb") as f:
        result = markitdown.convert(f, file_extension=".xls")
    for test_string in XLS_TEST_STRINGS:
        text_content = result.text_content.replace("\\", "")
        assert test_string in text_content

    # Test DOCX processing
    with open(os.path.join(TEST_FILES_DIR, "test.docx"), "rb") as f:
        result = markitdown.convert(f, file_extension=".docx")
    validate_strings(result, DOCX_TEST_STRINGS)

    # Test DOCX processing, with comments
    with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f:
        result = markitdown.convert(
            f,
            file_extension=".docx",
            style_map="comment-reference => ",
        )
    validate_strings(result, DOCX_COMMENT_TEST_STRINGS)

    # Test DOCX processing, with comments and setting style_map on init
    markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
    with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f:
        result = markitdown_with_style_map.convert(f, file_extension=".docx")
    validate_strings(result, DOCX_COMMENT_TEST_STRINGS)

    # Test PPTX processing
    with open(os.path.join(TEST_FILES_DIR, "test.pptx"), "rb") as f:
        result = markitdown.convert(f, file_extension=".pptx")
    validate_strings(result, PPTX_TEST_STRINGS)

    # Test HTML processing
    with open(os.path.join(TEST_FILES_DIR, "test_blog.html"), "rb") as f:
        result = markitdown.convert(f, file_extension=".html", url=BLOG_TEST_URL)
    validate_strings(result, BLOG_TEST_STRINGS)

    # Test Wikipedia processing
    with open(os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), "rb") as f:
        result = markitdown.convert(f, file_extension=".html", url=WIKIPEDIA_TEST_URL)
    text_content = result.text_content.replace("\\", "")
    validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)

    # Test Bing processing
    with open(os.path.join(TEST_FILES_DIR, "test_serp.html"), "rb") as f:
        result = markitdown.convert(f, file_extension=".html", url=SERP_TEST_URL)
    text_content = result.text_content.replace("\\", "")
    validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)

    # Test RSS processing
    with open(os.path.join(TEST_FILES_DIR, "test_rss.xml"), "rb") as f:
        result = markitdown.convert(f, file_extension=".xml")
    text_content = result.text_content.replace("\\", "")
    for test_string in RSS_TEST_STRINGS:
        assert test_string in text_content

    # Test MSG (Outlook email) processing
    with open(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"), "rb") as f:
        result = markitdown.convert(f, file_extension=".msg")
    validate_strings(result, MSG_TEST_STRINGS)

    # Test JSON processing
    with open(os.path.join(TEST_FILES_DIR, "test.json"), "rb") as f:
        result = markitdown.convert(f, file_extension=".json")
    validate_strings(result, JSON_TEST_STRINGS)


@pytest.mark.skipif(
    skip_remote,
    reason="do not remotely run speech transcription tests",
)
def test_speech_transcription() -> None:
    markitdown = MarkItDown()

    # Test WAV, MP3, and M4A files
    for file_name in ["test.wav", "test.mp3", "test.m4a"]:
        result = markitdown.convert(os.path.join(TEST_FILES_DIR, file_name))
        result_lower = result.text_content.lower()
        assert (
            ("1" in result_lower or "one" in result_lower)
            and ("2" in result_lower or "two" in result_lower)
            and ("3" in result_lower or "three" in result_lower)
            and ("4" in result_lower or "four" in result_lower)
            and ("5" in result_lower or "five" in result_lower)
        )


def test_exceptions() -> None:
    # Check that an exception is raised when trying to convert an unsupported format
    markitdown = MarkItDown()
@@ -295,17 +520,20 @@ def test_markitdown_exiftool() -> None:
    # Test the automatic discovery of exiftool throws a warning
    # and is disabled
    try:
        with catch_warnings(record=True) as w:
            warnings.simplefilter("default")
        with warnings.catch_warnings(record=True) as w:
            markitdown = MarkItDown()
            result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
            assert len(w) == 1
            assert w[0].category is DeprecationWarning
            assert result.text_content.strip() == ""
    finally:
        resetwarnings()
        warnings.resetwarnings()

    which_exiftool = shutil.which("exiftool")
    assert which_exiftool is not None

    # Test explicitly setting the location of exiftool
    which_exiftool = shutil.which("exiftool")
    markitdown = MarkItDown(exiftool_path=which_exiftool)
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
    for key in JPG_TEST_EXIFTOOL:
@@ -320,6 +548,12 @@ def test_markitdown_exiftool() -> None:
            target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
            assert target in result.text_content

    # Test some other media types
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.mp3"))
    for key in MP3_TEST_EXIFTOOL:
        target = f"{key}: {MP3_TEST_EXIFTOOL[key]}"
        assert target in result.text_content
@pytest.mark.skipif(
|
||||
skip_llm,
|
||||
@@ -330,7 +564,6 @@ def test_markitdown_llm() -> None:
|
||||
markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o")
|
||||
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg"))
|
||||
|
||||
for test_string in LLM_TEST_STRINGS:
|
||||
assert test_string in result.text_content
|
||||
|
||||
@@ -339,12 +572,24 @@ def test_markitdown_llm() -> None:
    for test_string in ["red", "circle", "blue", "square"]:
        assert test_string in result.text_content.lower()

    # Images embedded in PPTX files
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
    # LLM captions are included
    for test_string in LLM_TEST_STRINGS:
        assert test_string in result.text_content
    # Standard alt text is included
    validate_strings(result, PPTX_TEST_STRINGS)


if __name__ == "__main__":
    """Runs this file's tests from the command line."""
    test_stream_info_operations()
    test_stream_info_guesses()
    test_markitdown_remote()
    test_markitdown_local()
    test_markitdown_streams()
    test_speech_transcription()
    test_exceptions()
    test_markitdown_exiftool()
    # test_markitdown_llm()
    test_markitdown_llm()
    print("All tests passed!")