Update converter API: use streams rather than file paths (#1088)
* Updated the DocumentConverter interface
* Updated all DocumentConverter classes
* Added support for various new audio file types
* Updated the sample plugin to the new DocumentConverter interface
* Updated the project README with notes about the changes and use-cases
* Updated the DocumentConverter documentation
* Moved priority outside DocumentConverter, allowing converters to be reprioritized while keeping the DocumentConverter interface simple

Co-authored-by: Kenny Zhang <kzhang678@gmail.com>
.gitattributes (vendored, 3 changes)
@@ -1 +1,2 @@
-tests/test_files/** linguist-vendored
+packages/markitdown/tests/test_files/** linguist-vendored
+packages/markitdown-sample-plugin/tests/test_files/** linguist-vendored

README.md (17 changes)
@@ -7,9 +7,11 @@
 > [!IMPORTANT]
 > Breaking changes between 0.0.1 and 0.0.2:
 > * Dependencies are now organized into optional feature-groups (further details below). Use `pip install markitdown[all]` to have backward-compatible behavior.
+> * The DocumentConverter class interface has changed to read from file-like streams rather than file paths. *No temporary files are created anymore*. If you are the maintainer of a plugin, or custom DocumentConverter, you likely need to update your code. Otherwise, if only using the MarkItDown class or CLI (as in these examples), you should not need to change anything.
 
-MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).
-It supports:
+MarkItDown is a lightweight Python utility for converting various files to Markdown for use with LLMs and related text analysis pipelines. To this end, it is most comparable to [textract](https://github.com/deanmalmgren/textract), but with a focus on preserving important document structure and content as Markdown (including: headings, lists, tables, links, etc.). While the output is often reasonably presentable and human-friendly, it is meant to be consumed by text analysis tools -- and may not be the best option for high-fidelity document conversions for human consumption.
+
+At present, MarkItDown supports:
 
 - PDF
 - PowerPoint
@@ -23,6 +25,17 @@ It supports:
 - Youtube URLs
 - ... and more!
 
+## Why Markdown?
+
+Markdown is extremely close to plain text, with minimal markup or formatting, but still
+provides a way to represent important document structure. Mainstream LLMs, such as
+OpenAI's GPT-4o, natively "_speak_" Markdown, and often incorporate Markdown into their
+responses unprompted. This suggests that they have been trained on vast amounts of
+Markdown-formatted text, and understand it well. As a side benefit, Markdown conventions
+are also highly token-efficient.
+
+## Installation
+
 To install MarkItDown, use pip: `pip install markitdown[all]`. Alternatively, you can install it from the source:
 
 ```bash
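
The breaking change above is scoped to converter implementations; end-user code is unaffected. A minimal sketch of both call styles (file name here is illustrative):

```python
from markitdown import MarkItDown

md = MarkItDown()

# Paths work exactly as before (and no temporary files are created internally anymore):
result = md.convert("example.pdf")
print(result.markdown)

# New with this change: binary file-like streams are accepted directly.
with open("example.pdf", "rb") as fh:
    print(md.convert(fh).markdown)
```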

packages/markitdown-sample-plugin/README.md
@@ -10,23 +10,38 @@ This project shows how to create a sample plugin for MarkItDown.
 Next, implement your custom DocumentConverter:
 
 ```python
-from typing import Union
-from markitdown import DocumentConverter, DocumentConverterResult
+from typing import BinaryIO, Any
+from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult, StreamInfo
 
 
 class RtfConverter(DocumentConverter):
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not an RTF file
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".rtf":
-            return None
-
-        # Implement the conversion logic here ...
-
-        # Return the result
-        return DocumentConverterResult(
-            title=title,
-            text_content=text_content,
-        )
+
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+        # Implement logic to check if the file stream is an RTF file
+        # ...
+        raise NotImplementedError()
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        # Implement logic to convert the file stream to Markdown
+        # ...
+        raise NotImplementedError()
 ```
 
 Next, make sure your package implements and exports the following:
@@ -71,10 +86,10 @@ Once the plugin package is installed, verify that it is available to MarkItDown
 markitdown --list-plugins
 ```
 
-To use the plugin for a conversion use the `--use-plugins` flag. For example, to convert a PDF:
+To use the plugin for a conversion use the `--use-plugins` flag. For example, to convert an RTF file:
 
 ```bash
-markitdown --use-plugins path-to-file.pdf
+markitdown --use-plugins path-to-file.rtf
 ```
 
 In Python, plugins can be enabled as follows:
@@ -83,7 +98,7 @@ In Python, plugins can be enabled as follows:
 from markitdown import MarkItDown
 
 md = MarkItDown(enable_plugins=True)
-result = md.convert("path-to-file.pdf")
+result = md.convert("path-to-file.rtf")
 print(result.text_content)
 ```

packages/markitdown-sample-plugin/pyproject.toml
@@ -24,7 +24,7 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
 dependencies = [
-  "markitdown",
+  "markitdown>=0.0.2a2",
   "striprtf",
 ]

packages/markitdown-sample-plugin/src/markitdown_sample_plugin/__about__.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.0.1a2"
+__version__ = "0.0.1a3"

packages/markitdown-sample-plugin/src/markitdown_sample_plugin/_plugin.py
@@ -1,12 +1,26 @@
-from typing import Union
+import locale
+from typing import BinaryIO, Any
 from striprtf.striprtf import rtf_to_text
 
-from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult
+from markitdown import (
+    MarkItDown,
+    DocumentConverter,
+    DocumentConverterResult,
+    StreamInfo,
+)
 
 
 __plugin_interface_version__ = (
     1  # The version of the plugin interface that this plugin uses
 )
 
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "text/rtf",
+    "application/rtf",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".rtf"]
+
+
 def register_converters(markitdown: MarkItDown, **kwargs):
     """
@@ -22,18 +36,41 @@ class RtfConverter(DocumentConverter):
     Converts an RTF file in the simplest possible way.
     """
 
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a RTF
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".rtf":
-            return None
-
-        # Read the RTF file
-        with open(local_path, "r") as f:
-            rtf = f.read()
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        # Read the file stream into a str using the provided charset encoding, or the system default
+        encoding = stream_info.charset or locale.getpreferredencoding()
+        stream_data = file_stream.read().decode(encoding)
 
         # Return the result
         return DocumentConverterResult(
             title=None,
-            text_content=rtf_to_text(rtf),
+            markdown=rtf_to_text(stream_data),
         )

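Putting the new interface together, here is a minimal sketch of driving this converter directly from an in-memory stream (the RTF snippet and expected output are illustrative):

```python
import io

from markitdown import StreamInfo
from markitdown_sample_plugin import RtfConverter

rtf_bytes = rb"{\rtf1\ansi Hello, world!}"  # toy RTF document

converter = RtfConverter()
stream = io.BytesIO(rtf_bytes)
info = StreamInfo(mimetype="text/rtf", extension=".rtf")

# accepts() matches on extension or mimetype; convert() decodes and strips the RTF.
if converter.accepts(stream, info):
    result = converter.convert(stream, info)
    print(result.markdown)  # expected: Hello, world!
```
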
packages/markitdown-sample-plugin/tests/test_sample_plugin.py
@@ -2,7 +2,7 @@
 import os
 import pytest
 
-from markitdown import MarkItDown
+from markitdown import MarkItDown, StreamInfo
 from markitdown_sample_plugin import RtfConverter
 
 TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
@@ -15,18 +15,22 @@ RTF_TEST_STRINGS = {
 
 def test_converter() -> None:
     """Tests the RTF converter directly."""
-    converter = RtfConverter()
-    result = converter.convert(
-        os.path.join(TEST_FILES_DIR, "test.rtf"), file_extension=".rtf"
-    )
+    with open(os.path.join(TEST_FILES_DIR, "test.rtf"), "rb") as file_stream:
+        converter = RtfConverter()
+        result = converter.convert(
+            file_stream=file_stream,
+            stream_info=StreamInfo(
+                mimetype="text/rtf", extension=".rtf", filename="test.rtf"
+            ),
+        )
 
-    for test_string in RTF_TEST_STRINGS:
-        assert test_string in result.text_content
+        for test_string in RTF_TEST_STRINGS:
+            assert test_string in result.text_content
 
 
 def test_markitdown() -> None:
     """Tests that MarkItDown correctly loads the plugin."""
-    md = MarkItDown()
+    md = MarkItDown(enable_plugins=True)
     result = md.convert(os.path.join(TEST_FILES_DIR, "test.rtf"))
 
     for test_string in RTF_TEST_STRINGS:

packages/markitdown/pyproject.toml
@@ -26,7 +26,7 @@ classifiers = [
 dependencies = [
   "beautifulsoup4",
   "requests",
-  "markdownify~=0.14.1",
+  "markdownify",
   "puremagic",
   "pathvalidate",
   "charset-normalizer",
@@ -78,11 +78,14 @@ extra-dependencies = [
 ]
 
 [tool.hatch.envs.types]
+features = ["all"]
 extra-dependencies = [
+  "openai",
   "mypy>=1.0.0",
 ]
 
 [tool.hatch.envs.types.scripts]
-check = "mypy --install-types --non-interactive {args:src/markitdown tests}"
+check = "mypy --install-types --non-interactive --ignore-missing-imports {args:src/markitdown tests}"
 
 [tool.coverage.run]
 source_pkgs = ["markitdown", "tests"]

packages/markitdown/src/markitdown/__about__.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.0.2a1"
+__version__ = "0.0.2a2"

packages/markitdown/src/markitdown/__init__.py
@@ -3,7 +3,13 @@
 # SPDX-License-Identifier: MIT
 
 from .__about__ import __version__
-from ._markitdown import MarkItDown
+from ._markitdown import (
+    MarkItDown,
+    PRIORITY_SPECIFIC_FILE_FORMAT,
+    PRIORITY_GENERIC_FILE_FORMAT,
+)
+from ._base_converter import DocumentConverterResult, DocumentConverter
+from ._stream_info import StreamInfo
 from ._exceptions import (
     MarkItDownException,
     MissingDependencyException,
@@ -11,7 +17,6 @@ from ._exceptions import (
     FileConversionException,
     UnsupportedFormatException,
 )
-from .converters import DocumentConverter, DocumentConverterResult
 
 __all__ = [
     "__version__",
@@ -23,4 +28,7 @@ __all__ = [
     "FailedConversionAttempt",
     "FileConversionException",
     "UnsupportedFormatException",
+    "StreamInfo",
+    "PRIORITY_SPECIFIC_FILE_FORMAT",
+    "PRIORITY_GENERIC_FILE_FORMAT",
 ]

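Taken together, the new exports give converter and plugin authors everything they need from the package root; a quick sketch of the imports this enables:

```python
from markitdown import (
    MarkItDown,
    DocumentConverter,
    DocumentConverterResult,
    StreamInfo,
    PRIORITY_SPECIFIC_FILE_FORMAT,  # 0.0  -- tried first
    PRIORITY_GENERIC_FILE_FORMAT,   # 10.0 -- near catch-all converters
)
```
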
packages/markitdown/src/markitdown/_base_converter.py (new file, 108 lines)

```python
import os
import tempfile
from warnings import warn
from typing import Any, Union, BinaryIO, Optional, List
from ._stream_info import StreamInfo


class DocumentConverterResult:
    """The result of converting a document to Markdown."""

    def __init__(
        self,
        markdown: str,
        *,
        title: Optional[str] = None,
    ):
        """
        Initialize the DocumentConverterResult.

        The only required parameter is the converted Markdown text.
        The title, and any other metadata that may be added in the future, are optional.

        Parameters:
        - markdown: The converted Markdown text.
        - title: Optional title of the document.
        """
        self.markdown = markdown
        self.title = title

    @property
    def text_content(self) -> str:
        """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""
        return self.markdown

    @text_content.setter
    def text_content(self, markdown: str):
        """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""
        self.markdown = markdown

    def __str__(self) -> str:
        """Return the converted Markdown text."""
        return self.markdown


class DocumentConverter:
    """Abstract superclass of all DocumentConverters."""

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Return a quick determination on whether the converter should attempt converting the document.
        This is primarily based on `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`).
        In cases where the data is retrieved via HTTP, the `stream_info.url` might also be referenced to
        make a determination (e.g., special converters for Wikipedia, YouTube, etc.).
        Finally, it is conceivable that `stream_info.filename` might be used in cases
        where the filename is well-known (e.g., `Dockerfile`, `Makefile`, etc.)

        NOTE: The method signature is designed to match that of the convert() method. This provides some
        assurance that, if accepts() returns True, the convert() method will also be able to handle the document.

        IMPORTANT: In rare cases (e.g., OutlookMsgConverter) we need to read more from the stream to make a final
        determination. Read operations inevitably advance the position in file_stream. In these cases, the position
        MUST be reset before returning. This is because the convert() method may be called immediately
        after accepts(), and will expect the file_stream to be at the original position.

        E.g.,
            cur_pos = file_stream.tell()  # Save the current position
            data = file_stream.read(100)  # ... peek at the first 100 bytes, etc.
            file_stream.seek(cur_pos)     # Reset the position to the original position

        Parameters:
        - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
        - stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, etc.)
        - kwargs: Additional keyword arguments for the converter.

        Returns:
        - bool: True if the converter can handle the document, False otherwise.
        """
        raise NotImplementedError(
            f"The subclass, {type(self).__name__}, must implement the accepts() method to determine if it can handle the document."
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert a document to Markdown text.

        Parameters:
        - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
        - stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, etc.)
        - kwargs: Additional keyword arguments for the converter.

        Returns:
        - DocumentConverterResult: The result of the conversion, which includes the title and markdown content.

        Raises:
        - FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
        - MissingDependencyException: If the converter requires a dependency that is not installed.
        """
        raise NotImplementedError("Subclasses must implement this method")
```
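
To make the position-reset contract concrete, here is a minimal sketch of a subclass whose accepts() must peek at the stream (the converter class and magic bytes are hypothetical, not part of the commit):

```python
import io
from typing import Any, BinaryIO

from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo


class MagicSniffConverter(DocumentConverter):
    """Hypothetical converter that must read bytes to decide."""

    MAGIC = b"%PDF-"

    def accepts(
        self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any
    ) -> bool:
        cur_pos = file_stream.tell()  # save the current position
        try:
            header = file_stream.read(len(self.MAGIC))  # peek at the header
        finally:
            file_stream.seek(cur_pos)  # MUST reset before returning
        return header == self.MAGIC

    def convert(
        self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any
    ) -> DocumentConverterResult:
        # Real conversion logic elided; return a stub result.
        return DocumentConverterResult(markdown="(converted content)")


stream = io.BytesIO(b"%PDF-1.7 ...")
converter = MagicSniffConverter()
assert converter.accepts(stream, StreamInfo())
assert stream.tell() == 0  # position unchanged, exactly as _convert() asserts

```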
packages/markitdown/src/markitdown/_exceptions.py
@@ -68,6 +68,9 @@ class FileConversionException(MarkItDownException):
         else:
             message = f"File conversion failed after {len(attempts)} attempts:\n"
             for attempt in attempts:
-                message += f" - {type(attempt.converter).__name__} threw {attempt.exc_info[0].__name__} with message: {attempt.exc_info[1]}\n"
+                if attempt.exc_info is None:
+                    message += f" - {type(attempt.converter).__name__} provided no exception info.\n"
+                else:
+                    message += f" - {type(attempt.converter).__name__} threw {attempt.exc_info[0].__name__} with message: {attempt.exc_info[1]}\n"
 
         super().__init__(message)

packages/markitdown/src/markitdown/_markitdown.py
@@ -6,8 +6,10 @@ import sys
 import tempfile
 import warnings
 import traceback
+import io
+from dataclasses import dataclass
 from importlib.metadata import entry_points
-from typing import Any, List, Optional, Union
+from typing import Any, List, Optional, Union, BinaryIO
 from pathlib import Path
 from urllib.parse import urlparse
 from warnings import warn
@@ -16,9 +18,9 @@ from warnings import warn
 import puremagic
 import requests
 
+from ._stream_info import StreamInfo, _guess_stream_info_from_stream
+
 from .converters import (
-    DocumentConverter,
-    DocumentConverterResult,
     PlainTextConverter,
     HtmlConverter,
     RssConverter,
@@ -32,26 +34,34 @@ from .converters import (
     XlsConverter,
     PptxConverter,
     ImageConverter,
-    WavConverter,
-    Mp3Converter,
+    AudioConverter,
     OutlookMsgConverter,
     ZipConverter,
     DocumentIntelligenceConverter,
 )
+
+from ._base_converter import DocumentConverter, DocumentConverterResult
+
 from ._exceptions import (
     FileConversionException,
     UnsupportedFormatException,
     FailedConversionAttempt,
 )
 
-# Override mimetype for csv to fix issue on windows
-mimetypes.add_type("text/csv", ".csv")
-
-_plugins: Union[None | List[Any]] = None
+# Lower priority values are tried first.
+PRIORITY_SPECIFIC_FILE_FORMAT = (
+    0.0  # e.g., .docx, .pdf, .xlsx, or specific pages, e.g., wikipedia
+)
+PRIORITY_GENERIC_FILE_FORMAT = (
+    10.0  # Near catch-all converters for mimetypes like text/*, etc.
+)
+
+_plugins: List[Any] = []
 
 
-def _load_plugins() -> Union[None | List[Any]]:
+def _load_plugins() -> List[Any]:
     """Lazy load plugins, exiting early if already loaded."""
     global _plugins
@@ -71,6 +81,14 @@ def _load_plugins() -> Union[None | List[Any]]:
     return _plugins
 
 
+@dataclass(kw_only=True, frozen=True)
+class ConverterRegistration:
+    """A registration of a converter with its priority and other metadata."""
+
+    converter: DocumentConverter
+    priority: float
+
+
 class MarkItDown:
     """(In preview) An extremely simple text-based document reader, suitable for LLM use.
     This reader will convert common file-types or webpages to Markdown."""
@@ -92,13 +110,13 @@ class MarkItDown:
         self._requests_session = requests_session
 
         # TODO - remove these (see enable_builtins)
-        self._llm_client = None
-        self._llm_model = None
-        self._exiftool_path = None
-        self._style_map = None
+        self._llm_client: Any = None
+        self._llm_model: Union[str | None] = None
+        self._exiftool_path: Union[str | None] = None
+        self._style_map: Union[str | None] = None
 
         # Register the converters
-        self._page_converters: List[DocumentConverter] = []
+        self._converters: List[ConverterRegistration] = []
 
         if (
             enable_builtins is None or enable_builtins
@@ -126,9 +144,15 @@ class MarkItDown:
             # Register converters for successful browsing operations
             # Later registrations are tried first / take higher priority than earlier registrations
             # To this end, the most specific converters should appear below the most generic converters
-            self.register_converter(PlainTextConverter())
-            self.register_converter(ZipConverter())
-            self.register_converter(HtmlConverter())
+            self.register_converter(
+                PlainTextConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
+            )
+            self.register_converter(
+                ZipConverter(markitdown=self), priority=PRIORITY_GENERIC_FILE_FORMAT
+            )
+            self.register_converter(
+                HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
+            )
             self.register_converter(RssConverter())
             self.register_converter(WikipediaConverter())
             self.register_converter(YouTubeConverter())
@@ -137,8 +161,7 @@ class MarkItDown:
             self.register_converter(XlsxConverter())
             self.register_converter(XlsConverter())
             self.register_converter(PptxConverter())
-            self.register_converter(WavConverter())
-            self.register_converter(Mp3Converter())
+            self.register_converter(AudioConverter())
             self.register_converter(ImageConverter())
             self.register_converter(IpynbConverter())
             self.register_converter(PdfConverter())
@@ -174,12 +197,17 @@ class MarkItDown:
             warn("Plugins converters are already enabled.", RuntimeWarning)
 
     def convert(
-        self, source: Union[str, requests.Response, Path], **kwargs: Any
+        self,
+        source: Union[str, requests.Response, Path, BinaryIO],
+        *,
+        stream_info: Optional[StreamInfo] = None,
+        **kwargs: Any,
     ) -> DocumentConverterResult:  # TODO: deal with kwargs
         """
         Args:
-        - source: can be a string representing a path either as string pathlib path object or url, or a requests.response object
-        - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
+        - source: can be a path (str or Path), url, or a requests.response object
+        - stream_info: optional stream info to use for the conversion. If None, infer from source
+        - kwargs: additional arguments to pass to the converter
         """
 
         # Local path or url
@@ -191,68 +219,120 @@
             ):
                 return self.convert_url(source, **kwargs)
             else:
-                return self.convert_local(source, **kwargs)
+                return self.convert_local(source, stream_info=stream_info, **kwargs)
+        # Path object
+        elif isinstance(source, Path):
+            return self.convert_local(source, stream_info=stream_info, **kwargs)
         # Request response
         elif isinstance(source, requests.Response):
             return self.convert_response(source, **kwargs)
-        elif isinstance(source, Path):
-            return self.convert_local(source, **kwargs)
+        # Binary stream
+        elif (
+            hasattr(source, "read")
+            and callable(source.read)
+            and not isinstance(source, io.TextIOBase)
+        ):
+            return self.convert_stream(source, **kwargs)
+        else:
+            raise TypeError(
+                f"Invalid source type: {type(source)}. Expected str, requests.Response, BinaryIO."
+            )
 
     def convert_local(
-        self, path: Union[str, Path], **kwargs: Any
-    ) -> DocumentConverterResult:  # TODO: deal with kwargs
+        self,
+        path: Union[str, Path],
+        *,
+        stream_info: Optional[StreamInfo] = None,
+        file_extension: Optional[str] = None,  # Deprecated -- use stream_info
+        url: Optional[str] = None,  # Deprecated -- use stream_info
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
         if isinstance(path, Path):
             path = str(path)
-        # Prepare a list of extensions to try (in order of priority)
-        ext = kwargs.get("file_extension")
-        extensions = [ext] if ext is not None else []
 
-        # Get extension alternatives from the path and puremagic
-        base, ext = os.path.splitext(path)
-        self._append_ext(extensions, ext)
+        # Build a base StreamInfo object from which to start guesses
+        base_stream_info = StreamInfo(
+            local_path=path,
+            extension=os.path.splitext(path)[1],
+            filename=os.path.basename(path),
+        )
 
-        for g in self._guess_ext_magic(path):
-            self._append_ext(extensions, g)
+        # Extend the base_stream_info with any additional info from the arguments
+        if stream_info is not None:
+            base_stream_info = base_stream_info.copy_and_update(stream_info)
 
-        # Convert
-        return self._convert(path, extensions, **kwargs)
+        if file_extension is not None:
+            # Deprecated -- use stream_info
+            base_stream_info = base_stream_info.copy_and_update(
+                extension=file_extension
+            )
+
+        if url is not None:
+            # Deprecated -- use stream_info
+            base_stream_info = base_stream_info.copy_and_update(url=url)
+
+        with open(path, "rb") as fh:
+            # Prepare a list of configurations to try, starting with the base_stream_info
+            guesses: List[StreamInfo] = [base_stream_info]
+            for guess in _guess_stream_info_from_stream(
+                file_stream=fh, filename_hint=path
+            ):
+                guesses.append(base_stream_info.copy_and_update(guess))
+            return self._convert(file_stream=fh, stream_info_guesses=guesses, **kwargs)
 
-    # TODO what should stream's type be?
     def convert_stream(
-        self, stream: Any, **kwargs: Any
-    ) -> DocumentConverterResult:  # TODO: deal with kwargs
-        # Prepare a list of extensions to try (in order of priority)
-        ext = kwargs.get("file_extension")
-        extensions = [ext] if ext is not None else []
-
-        # Save the file locally to a temporary file. It will be deleted before this method exits
-        handle, temp_path = tempfile.mkstemp()
-        fh = os.fdopen(handle, "wb")
-        result = None
-        try:
-            # Write to the temporary file
-            content = stream.read()
-            if isinstance(content, str):
-                fh.write(content.encode("utf-8"))
-            else:
-                fh.write(content)
-            fh.close()
-
-            # Use puremagic to check for more extension options
-            for g in self._guess_ext_magic(temp_path):
-                self._append_ext(extensions, g)
-
-            # Convert
-            result = self._convert(temp_path, extensions, **kwargs)
-        # Clean up
-        finally:
-            try:
-                fh.close()
-            except Exception:
-                pass
-            os.unlink(temp_path)
-
-        return result
+        self,
+        stream: BinaryIO,
+        *,
+        stream_info: Optional[StreamInfo] = None,
+        file_extension: Optional[str] = None,  # Deprecated -- use stream_info
+        url: Optional[str] = None,  # Deprecated -- use stream_info
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        guesses: List[StreamInfo] = []
+
+        # Do we have anything on which to base a guess?
+        base_guess = None
+        if stream_info is not None or file_extension is not None or url is not None:
+            # Start with a non-Null base guess
+            if stream_info is None:
+                base_guess = StreamInfo()
+            else:
+                base_guess = stream_info
+
+            if file_extension is not None:
+                # Deprecated -- use stream_info
+                assert base_guess is not None  # for mypy
+                base_guess = base_guess.copy_and_update(extension=file_extension)
+
+            if url is not None:
+                # Deprecated -- use stream_info
+                assert base_guess is not None  # for mypy
+                base_guess = base_guess.copy_and_update(url=url)
+
+        # Append the base guess, if it's non-trivial
+        if base_guess is not None:
+            if base_guess.mimetype is not None or base_guess.extension is not None:
+                guesses.append(base_guess)
+        else:
+            # Create a base guess with no information
+            base_guess = StreamInfo()
+
+        # Create a placeholder filename to help with guessing
+        placeholder_filename = None
+        if base_guess.filename is not None:
+            placeholder_filename = base_guess.filename
+        elif base_guess.extension is not None:
+            placeholder_filename = "placeholder" + base_guess.extension
+
+        # Add guesses based on stream content
+        for guess in _guess_stream_info_from_stream(
+            file_stream=stream, filename_hint=placeholder_filename
+        ):
+            guesses.append(base_guess.copy_and_update(guess))
+
+        # Perform the conversion
+        return self._convert(file_stream=stream, stream_info_guesses=guesses, **kwargs)
 
     def convert_url(
         self, url: str, **kwargs: Any
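
Before moving on to convert_response(), a quick sketch of what the new stream handling buys callers (file names and bytes here are illustrative):

```python
import io

from markitdown import MarkItDown, StreamInfo

md = MarkItDown()

# Binary streams are routed to convert_stream(); text-mode streams raise TypeError.
with open("report.pdf", "rb") as fh:
    result = md.convert(fh)

# A bare BytesIO carries no filename, so hints can be supplied explicitly:
data = io.BytesIO(b"...")  # bytes elided
result = md.convert_stream(data, stream_info=StreamInfo(extension=".pdf"))
```
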
@@ -263,55 +343,94 @@
         return self.convert_response(response, **kwargs)
 
     def convert_response(
-        self, response: requests.Response, **kwargs: Any
-    ) -> DocumentConverterResult:  # TODO fix kwargs type
-        # Prepare a list of extensions to try (in order of priority)
-        ext = kwargs.get("file_extension")
-        extensions = [ext] if ext is not None else []
+        self,
+        response: requests.Response,
+        *,
+        stream_info: Optional[StreamInfo] = None,
+        file_extension: Optional[str] = None,  # Deprecated -- use stream_info
+        url: Optional[str] = None,  # Deprecated -- use stream_info
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        # If there is a content-type header, get the mimetype and charset (if present)
+        mimetype: Optional[str] = None
+        charset: Optional[str] = None
 
-        # Guess from the mimetype
-        content_type = response.headers.get("content-type", "").split(";")[0]
-        self._append_ext(extensions, mimetypes.guess_extension(content_type))
+        if "content-type" in response.headers:
+            parts = response.headers["content-type"].split(";")
+            mimetype = parts.pop(0).strip()
+            for part in parts:
+                if part.strip().startswith("charset="):
+                    _charset = part.split("=")[1].strip()
+                    if len(_charset) > 0:
+                        charset = _charset
 
-        # Read the content disposition if there is one
-        content_disposition = response.headers.get("content-disposition", "")
-        m = re.search(r"filename=([^;]+)", content_disposition)
-        if m:
-            base, ext = os.path.splitext(m.group(1).strip("\"'"))
-            self._append_ext(extensions, ext)
+        # If there is a content-disposition header, get the filename and possibly the extension
+        filename: Optional[str] = None
+        extension: Optional[str] = None
+        if "content-disposition" in response.headers:
+            m = re.search(r"filename=([^;]+)", response.headers["content-disposition"])
+            if m:
+                filename = m.group(1).strip("\"'")
+                _, _extension = os.path.splitext(filename)
+                if len(_extension) > 0:
+                    extension = _extension
 
-        # Read from the extension from the path
-        base, ext = os.path.splitext(urlparse(response.url).path)
-        self._append_ext(extensions, ext)
+        # If there is still no filename, try to read it from the url
+        if filename is None:
+            parsed_url = urlparse(response.url)
+            _, _extension = os.path.splitext(parsed_url.path)
+            if len(_extension) > 0:  # Looks like this might be a file!
+                filename = os.path.basename(parsed_url.path)
+                extension = _extension
 
-        # Save the file locally to a temporary file. It will be deleted before this method exits
-        handle, temp_path = tempfile.mkstemp()
-        fh = os.fdopen(handle, "wb")
-        result = None
-        try:
-            # Download the file
-            for chunk in response.iter_content(chunk_size=512):
-                fh.write(chunk)
-            fh.close()
+        # Create an initial guess from all this information
+        base_guess = StreamInfo(
+            mimetype=mimetype,
+            charset=charset,
+            filename=filename,
+            extension=extension,
+            url=response.url,
+        )
 
-            # Use puremagic to check for more extension options
-            for g in self._guess_ext_magic(temp_path):
-                self._append_ext(extensions, g)
+        # Update with any additional info from the arguments
+        if stream_info is not None:
+            base_guess = base_guess.copy_and_update(stream_info)
+        if file_extension is not None:
+            # Deprecated -- use stream_info
+            base_guess = base_guess.copy_and_update(extension=file_extension)
+        if url is not None:
+            # Deprecated -- use stream_info
+            base_guess = base_guess.copy_and_update(url=url)
 
-            # Convert
-            result = self._convert(temp_path, extensions, url=response.url, **kwargs)
-        # Clean up
-        finally:
-            try:
-                fh.close()
-            except Exception:
-                pass
-            os.unlink(temp_path)
+        # Add the guess if it's non-trivial
+        guesses: List[StreamInfo] = []
+        if base_guess.mimetype is not None or base_guess.extension is not None:
+            guesses.append(base_guess)
 
-        return result
+        # Read into BytesIO
+        buffer = io.BytesIO()
+        for chunk in response.iter_content(chunk_size=512):
+            buffer.write(chunk)
+        buffer.seek(0)
+
+        # Create a placeholder filename to help with guessing
+        placeholder_filename = None
+        if base_guess.filename is not None:
+            placeholder_filename = base_guess.filename
+        elif base_guess.extension is not None:
+            placeholder_filename = "placeholder" + base_guess.extension
+
+        # Add guesses based on stream content
+        for guess in _guess_stream_info_from_stream(
+            file_stream=buffer, filename_hint=placeholder_filename
+        ):
+            guesses.append(base_guess.copy_and_update(guess))
+
+        # Convert
+        return self._convert(file_stream=buffer, stream_info_guesses=guesses, **kwargs)
 
     def _convert(
-        self, local_path: str, extensions: List[Union[str, None]], **kwargs
+        self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs
     ) -> DocumentConverterResult:
         res: Union[None, DocumentConverterResult] = None
@@ -321,19 +440,21 @@
         # Create a copy of the page_converters list, sorted by priority.
         # We do this with each call to _convert because the priority of converters may change between calls.
         # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
-        sorted_converters = sorted(self._page_converters, key=lambda x: x.priority)
+        sorted_registrations = sorted(self._converters, key=lambda x: x.priority)
+
+        # Remember the initial stream position so that we can return to it
+        cur_pos = file_stream.tell()
 
-        for ext in extensions + [None]:  # Try last with no extension
-            for converter in sorted_converters:
+        for stream_info in stream_info_guesses + [StreamInfo()]:
+            for converter_registration in sorted_registrations:
+                converter = converter_registration.converter
+
+                # Sanity check -- make sure the cur_pos is still the same
+                assert (
+                    cur_pos == file_stream.tell()
+                ), "File stream position should NOT change between guess iterations"
+
                 _kwargs = copy.deepcopy(kwargs)
 
-                # Overwrite file_extension appropriately
-                if ext is None:
-                    if "file_extension" in _kwargs:
-                        del _kwargs["file_extension"]
-                else:
-                    _kwargs.update({"file_extension": ext})
-
                 # Copy any additional global options
                 if "llm_client" not in _kwargs and self._llm_client is not None:
                     _kwargs["llm_client"] = self._llm_client
@@ -348,17 +469,40 @@
                     _kwargs["exiftool_path"] = self._exiftool_path
 
                 # Add the list of converters for nested processing
-                _kwargs["_parent_converters"] = self._page_converters
+                _kwargs["_parent_converters"] = self._converters
+
+                # Add legacy kwargs
+                if stream_info is not None:
+                    if stream_info.extension is not None:
+                        _kwargs["file_extension"] = stream_info.extension
+
+                    if stream_info.url is not None:
+                        _kwargs["url"] = stream_info.url
 
-                # If we hit an error log it and keep trying
+                # Check if the converter will accept the file, and if so, try to convert it
+                _accepts = False
                 try:
-                    res = converter.convert(local_path, **_kwargs)
-                except Exception:
-                    failed_attempts.append(
-                        FailedConversionAttempt(
-                            converter=converter, exc_info=sys.exc_info()
-                        )
-                    )
+                    _accepts = converter.accepts(file_stream, stream_info, **_kwargs)
+                except NotImplementedError:
+                    pass
+
+                # accepts() should not have changed the file stream position
+                assert (
+                    cur_pos == file_stream.tell()
+                ), f"{type(converter).__name__}.accepts() should NOT change the file_stream position"
+
+                # Attempt the conversion
+                if _accepts:
+                    try:
+                        res = converter.convert(file_stream, stream_info, **_kwargs)
+                    except Exception:
+                        failed_attempts.append(
+                            FailedConversionAttempt(
+                                converter=converter, exc_info=sys.exc_info()
+                            )
+                        )
+                    finally:
+                        file_stream.seek(cur_pos)
 
                 if res is not None:
                     # Normalize the content
@@ -366,8 +510,6 @@
                         [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
                     )
                     res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
-
-                    # Todo
                     return res
 
         # If we got this far without success, report any exceptions
@@ -376,61 +518,9 @@
 
         # Nothing can handle it!
         raise UnsupportedFormatException(
-            f"Could not convert '{local_path}' to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
+            "Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
         )
 
-    def _append_ext(self, extensions, ext):
-        """Append a unique non-None, non-empty extension to a list of extensions."""
-        if ext is None:
-            return
-        ext = ext.strip()
-        if ext == "":
-            return
-        if ext in extensions:
-            return
-        extensions.append(ext)
-
-    def _guess_ext_magic(self, path):
-        """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
-        # Use puremagic to guess
-        try:
-            guesses = puremagic.magic_file(path)
-
-            # Fix for: https://github.com/microsoft/markitdown/issues/222
-            # If there are no guesses, then try again after trimming leading ASCII whitespaces.
-            # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
-            # (space, tab, newline, carriage return, vertical tab, form feed).
-            if len(guesses) == 0:
-                with open(path, "rb") as file:
-                    while True:
-                        char = file.read(1)
-                        if not char:  # End of file
-                            break
-                        if not char.isspace():
-                            file.seek(file.tell() - 1)
-                            break
-                    try:
-                        guesses = puremagic.magic_stream(file)
-                    except puremagic.main.PureError:
-                        pass
-
-            extensions = list()
-            for g in guesses:
-                ext = g.extension.strip()
-                if len(ext) > 0:
-                    if not ext.startswith("."):
-                        ext = "." + ext
-                    if ext not in extensions:
-                        extensions.append(ext)
-            return extensions
-        except FileNotFoundError:
-            pass
-        except IsADirectoryError:
-            pass
-        except PermissionError:
-            pass
-        return []
-
     def register_page_converter(self, converter: DocumentConverter) -> None:
         """DEPRECATED: Use register_converter instead."""
         warn(
@@ -439,6 +529,34 @@
         )
         self.register_converter(converter)
 
-    def register_converter(self, converter: DocumentConverter) -> None:
-        """Register a page text converter."""
-        self._page_converters.insert(0, converter)
+    def register_converter(
+        self,
+        converter: DocumentConverter,
+        *,
+        priority: float = PRIORITY_SPECIFIC_FILE_FORMAT,
+    ) -> None:
+        """
+        Register a DocumentConverter with a given priority.
+
+        Priorities work as follows: By default, most converters get priority
+        PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exceptions are the
+        PlainTextConverter, HtmlConverter, and ZipConverter, which get
+        priority PRIORITY_GENERIC_FILE_FORMAT (== 10), with lower values
+        being tried first (i.e., higher priority).
+
+        Just prior to conversion, the converters are sorted by priority, using
+        a stable sort. This means that converters with the same priority will
+        remain in the same order, with the most recently registered converters
+        appearing first.
+
+        We have tight control over the order of built-in converters, but
+        plugins can register converters in any order. The registration's priority
+        field reasserts some control over the order of converters.
+
+        Plugins can register converters with any priority, to appear before or
+        after the built-ins. For example, a plugin with priority 9 will run
+        before the PlainTextConverter, but after the built-in converters.
+        """
+        self._converters.insert(
+            0, ConverterRegistration(converter=converter, priority=priority)
+        )

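A sketch of re-prioritizing a converter at registration time (MyConverter is a hypothetical DocumentConverter subclass, not part of the commit):

```python
from markitdown import MarkItDown, PRIORITY_GENERIC_FILE_FORMAT

md = MarkItDown()

# 9.0 sorts after the format-specific built-ins (0.0) but before the
# generic catch-alls such as PlainTextConverter (10.0).
md.register_converter(MyConverter(), priority=9.0)

# The same converter class can also be registered as a last-resort fallback:
md.register_converter(MyConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT + 1)
```
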
packages/markitdown/src/markitdown/_stream_info.py (new file, 122 lines)

```python
import puremagic
import mimetypes
import os
from dataclasses import dataclass, asdict
from typing import Optional, BinaryIO, List, TypeVar, Type

# Mimetype substitutions table
MIMETYPE_SUBSTITUTIONS = {
    "application/excel": "application/vnd.ms-excel",
    "application/mspowerpoint": "application/vnd.ms-powerpoint",
}


@dataclass(kw_only=True, frozen=True)
class StreamInfo:
    """The StreamInfo class is used to store information about a file stream.
    All fields can be None, and will depend on how the stream was opened.
    """

    mimetype: Optional[str] = None
    extension: Optional[str] = None
    charset: Optional[str] = None
    filename: Optional[
        str
    ] = None  # From local path, url, or Content-Disposition header
    local_path: Optional[str] = None  # If read from disk
    url: Optional[str] = None  # If read from url

    def copy_and_update(self, *args, **kwargs):
        """Copy the StreamInfo object and update it with the given StreamInfo
        instance and/or other keyword arguments."""
        new_info = asdict(self)

        for si in args:
            assert isinstance(si, StreamInfo)
            new_info.update({k: v for k, v in asdict(si).items() if v is not None})

        if len(kwargs) > 0:
            new_info.update(kwargs)

        return StreamInfo(**new_info)


# Behavior subject to change.
# Do not rely on this outside of this module.
def _guess_stream_info_from_stream(
    file_stream: BinaryIO,
    *,
    filename_hint: Optional[str] = None,
) -> List[StreamInfo]:
    """
    Guess StreamInfo properties (mostly mimetype and extension) from a stream.

    Args:
    - stream: The stream to guess the StreamInfo from.
    - filename_hint [Optional]: A filename hint to help with the guessing (may be a placeholder, and not actually be the file name)

    Returns a list of StreamInfo objects in order of confidence.
    """
    guesses: List[StreamInfo] = []

    # Add a guess purely based on the filename hint
    if filename_hint:
        try:
            # Requires Python 3.13+
            mimetype, _ = mimetypes.guess_file_type(filename_hint)  # type: ignore
        except AttributeError:
            mimetype, _ = mimetypes.guess_type(filename_hint)

        if mimetype:
            guesses.append(
                StreamInfo(
                    mimetype=mimetype, extension=os.path.splitext(filename_hint)[1]
                )
            )

    def _puremagic(
        file_stream, filename_hint
    ) -> List[puremagic.main.PureMagicWithConfidence]:
        """Wrap guesses to handle exceptions."""
        try:
            return puremagic.magic_stream(file_stream, filename=filename_hint)
        except puremagic.main.PureError:
            return []

    cur_pos = file_stream.tell()
    type_guesses = _puremagic(file_stream, filename_hint=filename_hint)
    if len(type_guesses) == 0:
        # Fix for: https://github.com/microsoft/markitdown/issues/222
        # If there are no guesses, then try again after trimming leading ASCII whitespaces.
        # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
        # (space, tab, newline, carriage return, vertical tab, form feed).

        # Eat all the leading whitespace
        file_stream.seek(cur_pos)
        while True:
            char = file_stream.read(1)
            if not char:  # End of file
                break
            if not char.isspace():
                file_stream.seek(file_stream.tell() - 1)
                break

        # Try again
        type_guesses = _puremagic(file_stream, filename_hint=filename_hint)
    file_stream.seek(cur_pos)

    # Convert and return the guesses
    for guess in type_guesses:
        kwargs: dict[str, str] = {}
        if guess.extension:
            kwargs["extension"] = guess.extension
        if guess.mime_type:
            kwargs["mimetype"] = MIMETYPE_SUBSTITUTIONS.get(
                guess.mime_type, guess.mime_type
            )
        if len(kwargs) > 0:
            # We don't add the filename_hint, because sometimes it's just a placeholder,
            # and, in any case, doesn't add new information.
            guesses.append(StreamInfo(**kwargs))

    return guesses
```
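
The copy_and_update() semantics (later non-None fields win, keyword arguments override) can be seen in a small sketch; the field values here are illustrative:

```python
from markitdown import StreamInfo

base = StreamInfo(extension=".xlsx", filename="report.xlsx")
sniffed = StreamInfo(mimetype="application/vnd.ms-excel")

merged = base.copy_and_update(sniffed, charset="utf-8")
print(merged.extension)  # .xlsx  (kept: the update had None here)
print(merged.mimetype)   # application/vnd.ms-excel  (taken from the update)
print(merged.charset)    # utf-8  (keyword argument wins)
```
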
packages/markitdown/src/markitdown/converters/__init__.py
@@ -2,7 +2,6 @@
 #
 # SPDX-License-Identifier: MIT
 
-from ._base import DocumentConverter, DocumentConverterResult
 from ._plain_text_converter import PlainTextConverter
 from ._html_converter import HtmlConverter
 from ._rss_converter import RssConverter
@@ -15,15 +14,12 @@ from ._docx_converter import DocxConverter
 from ._xlsx_converter import XlsxConverter, XlsConverter
 from ._pptx_converter import PptxConverter
 from ._image_converter import ImageConverter
-from ._wav_converter import WavConverter
-from ._mp3_converter import Mp3Converter
+from ._audio_converter import AudioConverter
 from ._outlook_msg_converter import OutlookMsgConverter
 from ._zip_converter import ZipConverter
 from ._doc_intel_converter import DocumentIntelligenceConverter
 
 __all__ = [
-    "DocumentConverter",
-    "DocumentConverterResult",
     "PlainTextConverter",
     "HtmlConverter",
     "RssConverter",
@@ -37,8 +33,7 @@ __all__ = [
     "XlsConverter",
     "PptxConverter",
     "ImageConverter",
-    "WavConverter",
-    "Mp3Converter",
+    "AudioConverter",
    "OutlookMsgConverter",
     "ZipConverter",
     "DocumentIntelligenceConverter",

packages/markitdown/src/markitdown/converters/_audio_converter.py (new file)
@@ -0,0 +1,102 @@
+import io
+from typing import Any, BinaryIO, Optional
+
+from ._exiftool import exiftool_metadata
+from ._transcribe_audio import transcribe_audio
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+from .._exceptions import MissingDependencyException
+
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "audio/x-wav",
+    "audio/mpeg",
+    "video/mp4",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [
+    ".wav",
+    ".mp3",
+    ".m4a",
+    ".mp4",
+]
+
+
+class AudioConverter(DocumentConverter):
+    """
+    Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
+    """
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        md_content = ""
+
+        # Add metadata
+        metadata = exiftool_metadata(
+            file_stream, exiftool_path=kwargs.get("exiftool_path")
+        )
+        if metadata:
+            for f in [
+                "Title",
+                "Artist",
+                "Author",
+                "Band",
+                "Album",
+                "Genre",
+                "Track",
+                "DateTimeOriginal",
+                "CreateDate",
+                # "Duration",  -- Wrong values when read from memory
+                "NumChannels",
+                "SampleRate",
+                "AvgBytesPerSec",
+                "BitsPerSample",
+            ]:
+                if f in metadata:
+                    md_content += f"{f}: {metadata[f]}\n"
+
+        # Figure out the audio format for transcription
+        if stream_info.extension == ".wav" or stream_info.mimetype == "audio/x-wav":
+            audio_format = "wav"
+        elif stream_info.extension == ".mp3" or stream_info.mimetype == "audio/mpeg":
+            audio_format = "mp3"
+        elif (
+            stream_info.extension in [".mp4", ".m4a"]
+            or stream_info.mimetype == "video/mp4"
+        ):
+            audio_format = "mp4"
+        else:
+            audio_format = None
+
+        # Transcribe
+        if audio_format:
+            try:
+                transcript = transcribe_audio(file_stream, audio_format=audio_format)
+                if transcript:
+                    md_content += "\n\n### Audio Transcript:\n" + transcript
+            except MissingDependencyException:
+                pass
+
+        # Return the result
+        return DocumentConverterResult(markdown=md_content.strip())
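
Aside (not part of the diff): a sketch of how the new stream-based AudioConverter is reached through the MarkItDown facade. The file name is illustrative, and it assumes `convert_stream` accepts a `file_extension` hint alongside the binary stream:

    from markitdown import MarkItDown

    md = MarkItDown()

    # Streams are now first-class; no temporary files are created.
    with open("interview.mp3", "rb") as fh:
        result = md.convert_stream(fh, file_extension=".mp3")

    print(result.markdown)  # metadata lines, plus "### Audio Transcript:" when available
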
packages/markitdown/src/markitdown/converters/_base.py (deleted)
@@ -1,63 +0,0 @@
-from typing import Any, Union
-
-
-class DocumentConverterResult:
-    """The result of converting a document to text."""
-
-    def __init__(self, title: Union[str, None] = None, text_content: str = ""):
-        self.title: Union[str, None] = title
-        self.text_content: str = text_content
-
-
-class DocumentConverter:
-    """Abstract superclass of all DocumentConverters."""
-
-    # Lower priority values are tried first.
-    PRIORITY_SPECIFIC_FILE_FORMAT = (
-        0.0  # e.g., .docx, .pdf, .xlsx, or specific pages, e.g., Wikipedia
-    )
-    PRIORITY_GENERIC_FILE_FORMAT = (
-        10.0  # Near catch-all converters for mimetypes like text/*, etc.
-    )
-
-    def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
-        """
-        Initialize the DocumentConverter with a given priority.
-
-        Priorities work as follows: By default, most converters get priority
-        DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
-        is the PlainTextConverter, which gets priority
-        PRIORITY_GENERIC_FILE_FORMAT (== 10). Lower values are tried first
-        (i.e., they take higher priority).
-
-        Just prior to conversion, the converters are sorted by priority, using
-        a stable sort. This means that converters with the same priority will
-        remain in the same order, with the most recently registered converters
-        appearing first.
-
-        We have tight control over the order of built-in converters, but
-        plugins can register converters in any order. A converter's priority
-        field reasserts some control over the order of converters.
-
-        Plugins can register converters with any priority, to appear before or
-        after the built-ins. For example, a plugin with priority 9 will run
-        before the PlainTextConverter, but after the built-in converters.
-        """
-        self._priority = priority
-
-    def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        raise NotImplementedError("Subclasses must implement this method")
-
-    @property
-    def priority(self) -> float:
-        """Priority of the converter in markitdown's converter list. Lower priority values are tried first."""
-        return self._priority
-
-    @priority.setter
-    def priority(self, value: float):
-        self._priority = value
-
-    @priority.deleter
-    def priority(self):
-        raise AttributeError("Cannot delete the priority attribute")
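
Aside (not part of the diff): with the priority machinery deleted here, reprioritization moves to registration time. A sketch, assuming `MarkItDown.register_converter` accepts a `priority` keyword as the commit message describes; `MyTsvConverter` is a hypothetical plugin converter:

    from markitdown import MarkItDown

    md = MarkItDown()
    converter = MyTsvConverter()  # hypothetical; uses the new accepts()/convert() interface

    # 9.0 sorts after the specific built-ins (0.0) but before the
    # near catch-all PlainTextConverter (10.0), per the removed docstring above.
    md.register_converter(converter, priority=9.0)
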
packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
@@ -1,14 +1,24 @@
-# type: ignore
-import base64
+import io
 import re
+import base64
-from typing import Union
 from urllib.parse import parse_qs, urlparse
+from typing import Any, BinaryIO, Optional
 from bs4 import BeautifulSoup
 
-from ._base import DocumentConverter, DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
 from ._markdownify import _CustomMarkdownify
+
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "text/html",
+    "application/xhtml",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [
+    ".html",
+    ".htm",
+]
 
 
 class BingSerpConverter(DocumentConverter):
     """
@@ -16,28 +26,47 @@ class BingSerpConverter(DocumentConverter):
     NOTE: It is better to use the Bing API
     """
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        """
+        Make sure we're dealing with HTML content *from* Bing.
+        """
+
+        url = stream_info.url or ""
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
 
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a Bing SERP
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() not in [".html", ".htm"]:
-            return None
-        url = kwargs.get("url", "")
         if not re.search(r"^https://www\.bing\.com/search\?q=", url):
-            return None
+            # Not a Bing SERP URL
+            return False
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        # Not HTML content
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
         # Parse the query parameters
-        parsed_params = parse_qs(urlparse(url).query)
+        parsed_params = parse_qs(urlparse(stream_info.url).query)
         query = parsed_params.get("q", [""])[0]
 
-        # Parse the file
-        soup = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            soup = BeautifulSoup(fh.read(), "html.parser")
+        # Parse the stream
+        encoding = "utf-8" if stream_info.charset is None else stream_info.charset
+        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
 
         # Clean up some formatting
         for tptt in soup.find_all(class_="tptt"):
@@ -81,6 +110,6 @@ class BingSerpConverter(DocumentConverter):
         )
 
         return DocumentConverterResult(
+            markdown=webpage_text,
             title=None if soup.title is None else soup.title.string,
-            text_content=webpage_text,
         )
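
Aside (not part of the diff): the accepts() gate can be exercised on its own. A sketch assuming StreamInfo takes the mimetype, extension, and url fields read above; the import paths mirror the diff but may differ in the released package:

    import io
    from markitdown.converters import BingSerpConverter
    from markitdown._stream_info import StreamInfo  # internal module

    converter = BingSerpConverter()
    html = io.BytesIO(b"<html><title>results</title></html>")

    serp = StreamInfo(mimetype="text/html", extension=".html",
                      url="https://www.bing.com/search?q=markitdown")
    print(converter.accepts(html, serp))   # True: HTML, and the URL is a Bing SERP

    other = StreamInfo(mimetype="text/html", extension=".html",
                       url="https://example.com/")
    print(converter.accepts(html, other))  # False: the URL check fails first
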
packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -1,9 +1,12 @@
-from typing import Any, Union
-import re
 import sys
+import re
 
-from ._base import DocumentConverter, DocumentConverterResult
-from .._exceptions import MissingDependencyException
+from typing import BinaryIO, Any, List
+from ._html_converter import HtmlConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 
 # Try loading optional (but in this case, required) dependencies
 # Save reporting of any exceptions for later
@@ -26,17 +29,50 @@ except ImportError:
     CONTENT_FORMAT = "markdown"
 
 
+OFFICE_MIME_TYPE_PREFIXES = [
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+    "application/vnd.openxmlformats-officedocument.presentationml",
+    "application/xhtml",
+    "text/html",
+]
+
+OTHER_MIME_TYPE_PREFIXES = [
+    "application/pdf",
+    "application/x-pdf",
+    "text/html",
+    "image/",
+]
+
+OFFICE_FILE_EXTENSIONS = [
+    ".docx",
+    ".xlsx",
+    ".pptx",
+    ".html",
+    ".htm",
+]
+
+OTHER_FILE_EXTENSIONS = [
+    ".pdf",
+    ".jpeg",
+    ".jpg",
+    ".png",
+    ".bmp",
+    ".tiff",
+    ".heif",
+]
+
+
 class DocumentIntelligenceConverter(DocumentConverter):
     """Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""
 
     def __init__(
         self,
         *,
-        priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
         endpoint: str,
         api_version: str = "2024-07-31-preview",
     ):
-        super().__init__(priority=priority)
+        super().__init__()
 
         # Raise an error if the dependencies are not available.
         # This is different than other converters since this one isn't even instantiated
@@ -44,9 +80,11 @@ class DocumentIntelligenceConverter(DocumentConverter):
         if _dependency_exc_info is not None:
             raise MissingDependencyException(
                 "DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`"
-            ) from _dependency_exc_info[1].with_traceback(
+            ) from _dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
                 _dependency_exc_info[2]
-            )  # Restore the original traceback
+            )
 
         self.endpoint = endpoint
         self.api_version = api_version
@@ -55,55 +93,62 @@ class DocumentIntelligenceConverter(DocumentConverter):
             api_version=self.api_version,
             credential=DefaultAzureCredential(),
         )
-        self._priority = priority
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in OFFICE_FILE_EXTENSIONS + OTHER_FILE_EXTENSIONS:
+            return True
+
+        for prefix in OFFICE_MIME_TYPE_PREFIXES + OTHER_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
+
+    def _analysis_features(self, stream_info: StreamInfo) -> List[str]:
+        """
+        Helper needed to determine which analysis features to use.
+        Certain document analysis features are not available for
+        office filetypes (.xlsx, .pptx, .html, .docx)
+        """
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in OFFICE_FILE_EXTENSIONS:
+            return []
+
+        for prefix in OFFICE_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return []
+
+        return [
+            DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
+            DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
+            DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
+        ]
 
     def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        # Bail if extension is not supported by Document Intelligence
-        extension = kwargs.get("file_extension", "")
-        docintel_extensions = [
-            ".pdf",
-            ".docx",
-            ".xlsx",
-            ".pptx",
-            ".html",
-            ".jpeg",
-            ".jpg",
-            ".png",
-            ".bmp",
-            ".tiff",
-            ".heif",
-        ]
-        if extension.lower() not in docintel_extensions:
-            return None
-
-        # Get the bytestring for the local path
-        with open(local_path, "rb") as f:
-            file_bytes = f.read()
-
-        # Certain document analysis features are not available for office filetypes (.xlsx, .pptx, .html, .docx)
-        if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]:
-            analysis_features = []
-        else:
-            analysis_features = [
-                DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
-                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
-                DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
-            ]
-
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
         # Extract the text using Azure Document Intelligence
         poller = self.doc_intel_client.begin_analyze_document(
             model_id="prebuilt-layout",
-            body=AnalyzeDocumentRequest(bytes_source=file_bytes),
-            features=analysis_features,
+            body=AnalyzeDocumentRequest(bytes_source=file_stream.read()),
+            features=self._analysis_features(stream_info),
             output_content_format=CONTENT_FORMAT,  # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
         )
         result: AnalyzeResult = poller.result()
 
         # remove comments from the markdown content generated by Doc Intelligence and append to markdown string
         markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
-        return DocumentConverterResult(
-            title=None,
-            text_content=markdown_text,
-        )
+        return DocumentConverterResult(markdown=markdown_text)
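
Aside (not part of the diff): a sketch of reaching this converter through MarkItDown, assuming the `docintel_endpoint` constructor parameter; the endpoint and file name are placeholders:

    from markitdown import MarkItDown

    md = MarkItDown(
        docintel_endpoint="https://<your-resource>.cognitiveservices.azure.com/"
    )

    # Office formats skip the formula/OCR/font features, per _analysis_features above.
    result = md.convert("slides.pptx")
    print(result.markdown)
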
packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -1,13 +1,10 @@
 import sys
 
-from typing import Union
+from typing import BinaryIO, Any
 
-from ._base import (
-    DocumentConverterResult,
-)
-from ._base import DocumentConverter
 from ._html_converter import HtmlConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 
 # Try loading optional (but in this case, required) dependencies
@@ -20,22 +17,46 @@ except ImportError:
     _dependency_exc_info = sys.exc_info()
 
 
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".docx"]
+
+
 class DocxConverter(HtmlConverter):
     """
     Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
     """
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
+    def __init__(self):
+        super().__init__()
+        self._html_converter = HtmlConverter()
 
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a DOCX
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".docx":
-            return None
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
         # Check the dependencies
         if _dependency_exc_info is not None:
             raise MissingDependencyException(
@@ -44,16 +65,13 @@ class DocxConverter(HtmlConverter):
                     extension=".docx",
                     feature="docx",
                 )
-            ) from _dependency_exc_info[1].with_traceback(
+            ) from _dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
                 _dependency_exc_info[2]
-            )  # Restore the original traceback
+            )
 
-        result = None
-        with open(local_path, "rb") as docx_file:
-            style_map = kwargs.get("style_map", None)
-
-            result = mammoth.convert_to_html(docx_file, style_map=style_map)
-            html_content = result.value
-            result = self._convert(html_content)
-
-        return result
+        style_map = kwargs.get("style_map", None)
+        return self._html_converter.convert_string(
+            mammoth.convert_to_html(file_stream, style_map=style_map).value
+        )
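
Aside (not part of the diff): the new DocxConverter body is just mammoth followed by HtmlConverter.convert_string, and the same two steps can be run by hand. A sketch; the file name is illustrative:

    import mammoth
    from markitdown.converters import HtmlConverter

    with open("report.docx", "rb") as fh:
        html = mammoth.convert_to_html(fh).value  # DOCX -> HTML

    result = HtmlConverter().convert_string(html)  # HTML -> Markdown
    print(result.markdown)
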
packages/markitdown/src/markitdown/converters/_exiftool.py (new file, 44 lines)
@@ -0,0 +1,44 @@
+import json
+import subprocess
+import locale
+import sys
+import shutil
+import os
+import warnings
+from typing import BinaryIO, Optional, Any
+
+
+def exiftool_metadata(
+    file_stream: BinaryIO, *, exiftool_path: Optional[str] = None
+) -> Any:  # Need a better type for json data
+    # Check if we have a valid pointer to exiftool
+    if not exiftool_path:
+        which_exiftool = shutil.which("exiftool")
+        if which_exiftool:
+            warnings.warn(
+                f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown constructor. E.g.,
+
+    md = MarkItDown(exiftool_path="{which_exiftool}")
+
+This warning will be removed in future releases.
+""",
+                DeprecationWarning,
+            )
+        # Nothing to do
+        return {}
+
+    # Run exiftool
+    cur_pos = file_stream.tell()
+    try:
+        output = subprocess.run(
+            [exiftool_path, "-json", "-"],
+            input=file_stream.read(),
+            capture_output=True,
+            text=False,
+        ).stdout
+
+        return json.loads(
+            output.decode(locale.getpreferredencoding(False)),
+        )[0]
+    finally:
+        file_stream.seek(cur_pos)
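
Aside (not part of the diff): exiftool_metadata pipes the stream to exiftool's stdin (`-json -`) and parses the JSON report, restoring the stream position afterwards. A usage sketch; the file name and exiftool path are illustrative:

    from markitdown.converters._exiftool import exiftool_metadata  # internal module

    with open("photo.jpg", "rb") as fh:
        metadata = exiftool_metadata(fh, exiftool_path="/usr/bin/exiftool")
        assert fh.tell() == 0  # the helper seeks back to where it started
    print(metadata.get("Title"))
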
packages/markitdown/src/markitdown/converters/_html_converter.py
@@ -1,37 +1,52 @@
-from typing import Any, Union
+import io
+from typing import Any, BinaryIO, Optional
 from bs4 import BeautifulSoup
 
-from ._base import DocumentConverter, DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
 from ._markdownify import _CustomMarkdownify
 
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "text/html",
+    "application/xhtml",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [
+    ".html",
+    ".htm",
+]
+
 
 class HtmlConverter(DocumentConverter):
     """Anything with content type text/html"""
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
 
     def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        # Bail if not html
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() not in [".html", ".htm"]:
-            return None
-
-        result = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            result = self._convert(fh.read())
-
-        return result
-
-    def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
-        """Helper function that converts an HTML string."""
-
-        # Parse the string
-        soup = BeautifulSoup(html_content, "html.parser")
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        # Parse the stream
+        encoding = "utf-8" if stream_info.charset is None else stream_info.charset
+        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
 
         # Remove javascript and style blocks
         for script in soup(["script", "style"]):
@@ -51,6 +66,25 @@ class HtmlConverter(DocumentConverter):
         webpage_text = webpage_text.strip()
 
         return DocumentConverterResult(
+            markdown=webpage_text,
             title=None if soup.title is None else soup.title.string,
-            text_content=webpage_text,
         )
+
+    def convert_string(
+        self, html_content: str, *, url: Optional[str] = None, **kwargs
+    ) -> DocumentConverterResult:
+        """
+        Non-standard convenience method to convert a string to markdown.
+        Given that many converters produce HTML as intermediate output, this
+        allows for easy conversion of HTML to markdown.
+        """
+        return self.convert(
+            file_stream=io.BytesIO(html_content.encode("utf-8")),
+            stream_info=StreamInfo(
+                mimetype="text/html",
+                extension=".html",
+                charset="utf-8",
+                url=url,
+            ),
+            **kwargs,
+        )
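
Aside (not part of the diff): convert_string turns any HTML-producing pipeline into a one-liner. A minimal sketch:

    from markitdown.converters import HtmlConverter

    html = "<html><head><title>Hi</title></head><body><h1>Hi</h1><p>Some <b>bold</b> text.</p></body></html>"
    result = HtmlConverter().convert_string(html)

    print(result.title)     # "Hi"
    print(result.markdown)  # roughly "# Hi\n\nSome **bold** text."
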
packages/markitdown/src/markitdown/converters/_image_converter.py
@@ -1,30 +1,53 @@
-from typing import Union
-from ._base import DocumentConverter, DocumentConverterResult
-from ._media_converter import MediaConverter
+from typing import BinaryIO, Any, Union
 import base64
 import mimetypes
+from ._exiftool import exiftool_metadata
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "image/jpeg",
+    "image/png",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".jpg", ".jpeg", ".png"]
 
 
-class ImageConverter(MediaConverter):
+class ImageConverter(DocumentConverter):
     """
     Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
     """
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
 
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not an image
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() not in [".jpg", ".jpeg", ".png"]:
-            return None
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
 
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
         md_content = ""
 
         # Add metadata
-        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
+        metadata = exiftool_metadata(
+            file_stream, exiftool_path=kwargs.get("exiftool_path")
+        )
         if metadata:
             for f in [
@@ -42,39 +65,59 @@ class ImageConverter(MediaConverter):
                 if f in metadata:
                     md_content += f"{f}: {metadata[f]}\n"
 
-        # Try describing the image with GPTV
+        # Try describing the image with GPT
         llm_client = kwargs.get("llm_client")
         llm_model = kwargs.get("llm_model")
         if llm_client is not None and llm_model is not None:
-            md_content += (
-                "\n# Description:\n"
-                + self._get_llm_description(
-                    local_path,
-                    extension,
-                    llm_client,
-                    llm_model,
-                    prompt=kwargs.get("llm_prompt"),
-                ).strip()
-                + "\n"
+            llm_description = self._get_llm_description(
+                file_stream,
+                stream_info,
+                client=llm_client,
+                model=llm_model,
+                prompt=kwargs.get("llm_prompt"),
             )
 
+            if llm_description is not None:
+                md_content += "\n# Description:\n" + llm_description.strip() + "\n"
+
         return DocumentConverterResult(
-            title=None,
-            text_content=md_content,
+            markdown=md_content,
         )
 
-    def _get_llm_description(self, local_path, extension, client, model, prompt=None):
+    def _get_llm_description(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        *,
+        client,
+        model,
+        prompt=None,
+    ) -> Union[None, str]:
         if prompt is None or prompt.strip() == "":
             prompt = "Write a detailed caption for this image."
 
-        data_uri = ""
-        with open(local_path, "rb") as image_file:
-            content_type, encoding = mimetypes.guess_type("_dummy" + extension)
-            if content_type is None:
-                content_type = "image/jpeg"
-            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
-            data_uri = f"data:{content_type};base64,{image_base64}"
+        # Get the content type
+        content_type = stream_info.mimetype
+        if not content_type:
+            content_type, _ = mimetypes.guess_type(
+                "_dummy" + (stream_info.extension or "")
+            )
+        if not content_type:
+            content_type = "application/octet-stream"
+
+        # Convert to base64
+        cur_pos = file_stream.tell()
+        try:
+            base64_image = base64.b64encode(file_stream.read()).decode("utf-8")
+        except Exception as e:
+            return None
+        finally:
+            file_stream.seek(cur_pos)
+
+        # Prepare the data-uri
+        data_uri = f"data:{content_type};base64,{base64_image}"
 
+        # Prepare the OpenAI API request
         messages = [
             {
                 "role": "user",
@@ -90,5 +133,6 @@ class ImageConverter(MediaConverter):
             }
         ]
 
+        # Call the OpenAI API
         response = client.chat.completions.create(model=model, messages=messages)
         return response.choices[0].message.content
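
Aside (not part of the diff): a sketch of enabling the LLM description path, assuming an OpenAI-compatible client; the model name and file name are illustrative:

    from openai import OpenAI
    from markitdown import MarkItDown

    client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
    md = MarkItDown(llm_client=client, llm_model="gpt-4o")

    result = md.convert("diagram.png")
    print(result.markdown)  # metadata fields plus a "# Description:" section
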
packages/markitdown/src/markitdown/converters/_ipynb_converter.py
@@ -1,39 +1,62 @@
+from typing import BinaryIO, Any
 import json
-from typing import Any, Union
-
-from ._base import (
-    DocumentConverter,
-    DocumentConverterResult,
-)
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._exceptions import FileConversionException
+from .._stream_info import StreamInfo
+
+CANDIDATE_MIME_TYPE_PREFIXES = [
+    "application/json",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".ipynb"]
 
 
 class IpynbConverter(DocumentConverter):
     """Converts Jupyter Notebook (.ipynb) files to Markdown."""
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in CANDIDATE_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                # Read further to see if it's a notebook
+                cur_pos = file_stream.tell()
+                try:
+                    encoding = stream_info.charset or "utf-8"
+                    notebook_content = file_stream.read().decode(encoding)
+                    return (
+                        "nbformat" in notebook_content
+                        and "nbformat_minor" in notebook_content
+                    )
+                finally:
+                    file_stream.seek(cur_pos)
+
+        return False
 
     def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        # Bail if not ipynb
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".ipynb":
-            return None
-
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
         # Parse and convert the notebook
         result = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            notebook_content = json.load(fh)
-            result = self._convert(notebook_content)
-
-        return result
+        encoding = stream_info.charset or "utf-8"
+        notebook_content = file_stream.read().decode(encoding=encoding)
+        return self._convert(json.loads(notebook_content))
 
-    def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
+    def _convert(self, notebook_content: dict) -> DocumentConverterResult:
        """Helper function that converts notebook JSON content to Markdown."""
        try:
            md_output = []
@@ -65,8 +88,8 @@ class IpynbConverter(DocumentConverter):
             title = notebook_content.get("metadata", {}).get("title", title)
 
             return DocumentConverterResult(
+                markdown=md_text,
                 title=title,
-                text_content=md_text,
             )
 
         except Exception as e:
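
Aside (not part of the diff): the content sniff in accepts() means a notebook served as plain application/json, with no .ipynb hint, is still recognized. A sketch; import paths may differ in the released package:

    import io
    import json
    from markitdown.converters import IpynbConverter
    from markitdown._stream_info import StreamInfo  # internal module

    notebook = {"nbformat": 4, "nbformat_minor": 5, "metadata": {}, "cells": []}
    stream = io.BytesIO(json.dumps(notebook).encode("utf-8"))

    info = StreamInfo(mimetype="application/json")  # no extension hint at all
    print(IpynbConverter().accepts(stream, info))   # True: both nbformat keys found
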
packages/markitdown/src/markitdown/converters/_llm_caption.py (new file)
@@ -0,0 +1,50 @@
+from typing import BinaryIO, Any, Union
+import base64
+import mimetypes
+from .._stream_info import StreamInfo
+
+
+def llm_caption(
+    file_stream: BinaryIO, stream_info: StreamInfo, *, client, model, prompt=None
+) -> Union[None, str]:
+    if prompt is None or prompt.strip() == "":
+        prompt = "Write a detailed caption for this image."
+
+    # Get the content type
+    content_type = stream_info.mimetype
+    if not content_type:
+        content_type, _ = mimetypes.guess_type("_dummy" + (stream_info.extension or ""))
+    if not content_type:
+        content_type = "application/octet-stream"
+
+    # Convert to base64
+    cur_pos = file_stream.tell()
+    try:
+        base64_image = base64.b64encode(file_stream.read()).decode("utf-8")
+    except Exception as e:
+        return None
+    finally:
+        file_stream.seek(cur_pos)
+
+    # Prepare the data-uri
+    data_uri = f"data:{content_type};base64,{base64_image}"
+
+    # Prepare the OpenAI API request
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": data_uri,
+                    },
+                },
+            ],
+        }
+    ]
+
+    # Call the OpenAI API
+    response = client.chat.completions.create(model=model, messages=messages)
+    return response.choices[0].message.content
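
Aside (not part of the diff): llm_caption factors the captioning logic into a standalone helper (ImageConverter still carries its own copy as _get_llm_description in this revision). A usage sketch; the module path, model, and file name are assumptions:

    from openai import OpenAI
    from markitdown.converters._llm_caption import llm_caption  # assumed path
    from markitdown._stream_info import StreamInfo

    with open("diagram.png", "rb") as fh:
        caption = llm_caption(
            fh,
            StreamInfo(mimetype="image/png", extension=".png"),
            client=OpenAI(),
            model="gpt-4o",
        )
    print(caption)
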
packages/markitdown/src/markitdown/converters/_markdownify.py
@@ -1,7 +1,7 @@
 import re
 import markdownify
 
-from typing import Any
+from typing import Any, Optional
 from urllib.parse import quote, unquote, urlparse, urlunparse
 
 
@@ -20,7 +20,14 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
         # Explicitly cast options to the expected type if necessary
         super().__init__(**options)
 
-    def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
+    def convert_hn(
+        self,
+        n: int,
+        el: Any,
+        text: str,
+        convert_as_inline: Optional[bool] = False,
+        **kwargs,
+    ) -> str:
         """Same as usual, but be sure to start with a new line"""
         if not convert_as_inline:
             if not re.search(r"^\n", text):
@@ -28,7 +35,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
 
         return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
 
-    def convert_a(self, el: Any, text: str, convert_as_inline: bool):
+    def convert_a(
+        self,
+        el: Any,
+        text: str,
+        convert_as_inline: Optional[bool] = False,
+        **kwargs,
+    ):
         """Same as usual converter, but removes Javascript links and escapes URIs."""
         prefix, suffix, text = markdownify.chomp(text)  # type: ignore
         if not text:
@@ -68,7 +81,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
             else text
         )
 
-    def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
+    def convert_img(
+        self,
+        el: Any,
+        text: str,
+        convert_as_inline: Optional[bool] = False,
+        **kwargs,
+    ) -> str:
         """Same as usual converter, but removes data URIs"""
 
         alt = el.attrs.get("alt", None) or ""
packages/markitdown/src/markitdown/converters/_media_converter.py (deleted)
@@ -1,41 +0,0 @@
-import subprocess
-import shutil
-import json
-from warnings import warn
-
-from ._base import DocumentConverter
-
-
-class MediaConverter(DocumentConverter):
-    """
-    Abstract class for multi-modal media (e.g., images and audio)
-    """
-
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
-    def _get_metadata(self, local_path, exiftool_path=None):
-        if not exiftool_path:
-            which_exiftool = shutil.which("exiftool")
-            if which_exiftool:
-                warn(
-                    f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown constructor. E.g.,
-
-    md = MarkItDown(exiftool_path="{which_exiftool}")
-
-This warning will be removed in future releases.
-""",
-                    DeprecationWarning,
-                )
-
-            return None
-        else:
-            if True:
-                result = subprocess.run(
-                    [exiftool_path, "-json", local_path], capture_output=True, text=True
-                ).stdout
-                return json.loads(result)[0]
-            # except Exception:
-            #     return None
packages/markitdown/src/markitdown/converters/_mp3_converter.py (deleted)
@@ -1,89 +0,0 @@
-import tempfile
-from typing import Union
-from ._base import DocumentConverter, DocumentConverterResult
-from ._wav_converter import WavConverter
-from warnings import resetwarnings, catch_warnings
-
-# Optional Transcription support
-IS_AUDIO_TRANSCRIPTION_CAPABLE = False
-try:
-    # Using warnings' catch_warnings to catch
-    # pydub's warning of ffmpeg or avconv missing
-    with catch_warnings(record=True) as w:
-        import pydub
-
-        if w:
-            raise ModuleNotFoundError
-    import speech_recognition as sr
-
-    IS_AUDIO_TRANSCRIPTION_CAPABLE = True
-except ModuleNotFoundError:
-    pass
-finally:
-    resetwarnings()
-
-
-class Mp3Converter(WavConverter):
-    """
-    Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
-    """
-
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a MP3
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".mp3":
-            return None
-
-        md_content = ""
-
-        # Add metadata
-        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
-        if metadata:
-            for f in [
-                "Title",
-                "Artist",
-                "Author",
-                "Band",
-                "Album",
-                "Genre",
-                "Track",
-                "DateTimeOriginal",
-                "CreateDate",
-                "Duration",
-            ]:
-                if f in metadata:
-                    md_content += f"{f}: {metadata[f]}\n"
-
-        # Transcribe
-        if IS_AUDIO_TRANSCRIPTION_CAPABLE:
-            handle, temp_path = tempfile.mkstemp(suffix=".wav")
-            os.close(handle)
-            try:
-                sound = pydub.AudioSegment.from_mp3(local_path)
-                sound.export(temp_path, format="wav")
-
-                _args = dict()
-                _args.update(kwargs)
-                _args["file_extension"] = ".wav"
-
-                try:
-                    transcript = super()._transcribe_audio(temp_path).strip()
-                    md_content += "\n\n### Audio Transcript:\n" + (
-                        "[No speech detected]" if transcript == "" else transcript
-                    )
-                except Exception:
-                    md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
-
-            finally:
-                os.unlink(temp_path)
-
-        # Return the result
-        return DocumentConverterResult(
-            title=None,
-            text_content=md_content.strip(),
-        )
packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
@@ -1,6 +1,7 @@
 import sys
-from typing import Any, Union
+from typing import Any, Union, BinaryIO
-from ._base import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 
 # Try loading optional (but in this case, required) dependencies
@@ -12,6 +13,12 @@ except ImportError:
     # Preserve the error and stack trace for later
     _dependency_exc_info = sys.exc_info()
 
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "application/vnd.ms-outlook",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".msg"]
+
 
 class OutlookMsgConverter(DocumentConverter):
     """Converts Outlook .msg files to markdown by extracting email metadata and content.
@@ -21,19 +28,52 @@ class OutlookMsgConverter(DocumentConverter):
     - Email body content
     """
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        # Check the extension and mimetype
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        # Brute force, check if we have an OLE file
+        cur_pos = file_stream.tell()
+        try:
+            if not olefile.isOleFile(file_stream):
+                return False
+        finally:
+            file_stream.seek(cur_pos)
+
+        # Brute force, check if it's an Outlook file
+        try:
+            msg = olefile.OleFileIO(file_stream)
+            toc = "\n".join([str(stream) for stream in msg.listdir()])
+            return (
+                "__properties_version1.0" in toc
+                and "__recip_version1.0_#00000000" in toc
+            )
+        except Exception as e:
+            pass
+        finally:
+            file_stream.seek(cur_pos)
+
+        return False
 
     def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        # Bail if not a MSG file
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".msg":
-            return None
-
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
         # Check the dependencies
         if _dependency_exc_info is not None:
             raise MissingDependencyException(
@@ -42,44 +82,41 @@ class OutlookMsgConverter(DocumentConverter):
                     extension=".msg",
                     feature="outlook",
                 )
-            ) from _dependency_exc_info[1].with_traceback(
+            ) from _dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
                 _dependency_exc_info[2]
-            )  # Restore the original traceback
+            )
 
-        try:
-            msg = olefile.OleFileIO(local_path)
-            # Extract email metadata
-            md_content = "# Email Message\n\n"
-
-            # Get headers
-            headers = {
-                "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"),
-                "To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
-                "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
-            }
-
-            # Add headers to markdown
-            for key, value in headers.items():
-                if value:
-                    md_content += f"**{key}:** {value}\n"
-
-            md_content += "\n## Content\n\n"
-
-            # Get email body
-            body = self._get_stream_data(msg, "__substg1.0_1000001F")
-            if body:
-                md_content += body
-
-            msg.close()
-
-            return DocumentConverterResult(
-                title=headers.get("Subject"), text_content=md_content.strip()
-            )
-
-        except Exception as e:
-            raise FileConversionException(
-                f"Could not convert MSG file '{local_path}': {str(e)}"
-            )
+        msg = olefile.OleFileIO(file_stream)
+        # Extract email metadata
+        md_content = "# Email Message\n\n"
+
+        # Get headers
+        headers = {
+            "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"),
+            "To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
+            "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
+        }
+
+        # Add headers to markdown
+        for key, value in headers.items():
+            if value:
+                md_content += f"**{key}:** {value}\n"
+
+        md_content += "\n## Content\n\n"
+
+        # Get email body
+        body = self._get_stream_data(msg, "__substg1.0_1000001F")
+        if body:
+            md_content += body
+
+        msg.close()
+
+        return DocumentConverterResult(
+            markdown=md_content.strip(),
+            title=headers.get("Subject"),
+        )
 
     def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
         """Helper to safely extract and decode stream data from the MSG file."""
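
Aside (not part of the diff): the brute-force check above leans on olefile, whose signature test works on file-like objects. A sketch; the file name is illustrative:

    import olefile

    with open("message.msg", "rb") as fh:
        if olefile.isOleFile(fh):  # checks the OLE2 magic bytes
            fh.seek(0)
            ole = olefile.OleFileIO(fh)
            streams = ["/".join(entry) for entry in ole.listdir()]
            # Outlook messages carry these well-known streams:
            print("__properties_version1.0" in streams)
            ole.close()
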
@@ -1,8 +1,15 @@
|
|||||||
import sys
|
import sys
|
||||||
from typing import Union
|
import io
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
|
||||||
|
from typing import BinaryIO, Any
|
||||||
|
|
||||||
|
|
||||||
|
from ._html_converter import HtmlConverter
|
||||||
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
|
from .._stream_info import StreamInfo
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
|
|
||||||
# Try loading optional (but in this case, required) dependencies
|
# Try loading optional (but in this case, required) dependencies
|
||||||
# Save reporting of any exceptions for later
|
# Save reporting of any exceptions for later
|
||||||
_dependency_exc_info = None
|
_dependency_exc_info = None
|
||||||
@@ -14,22 +21,43 @@ except ImportError:
|
|||||||
_dependency_exc_info = sys.exc_info()
|
_dependency_exc_info = sys.exc_info()
|
||||||
|
|
||||||
|
|
||||||
|
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||||
|
"application/pdf",
|
||||||
|
"application/x-pdf",
|
||||||
|
]
|
||||||
|
|
||||||
|
ACCEPTED_FILE_EXTENSIONS = [".pdf"]
|
||||||
|
|
||||||
|
|
||||||
class PdfConverter(DocumentConverter):
|
class PdfConverter(DocumentConverter):
|
||||||
"""
|
"""
|
||||||
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
|
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def accepts(
|
||||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
self,
|
||||||
):
|
file_stream: BinaryIO,
|
||||||
super().__init__(priority=priority)
|
stream_info: StreamInfo,
|
||||||
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> bool:
|
||||||
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
|
extension = (stream_info.extension or "").lower()
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
if extension in ACCEPTED_FILE_EXTENSIONS:
|
||||||
# Bail if not a PDF
|
return True
|
||||||
extension = kwargs.get("file_extension", "")
|
|
||||||
if extension.lower() != ".pdf":
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
||||||
|
if mimetype.startswith(prefix):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def convert(
|
||||||
|
self,
|
||||||
|
file_stream: BinaryIO,
|
||||||
|
stream_info: StreamInfo,
|
||||||
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> DocumentConverterResult:
|
||||||
# Check the dependencies
|
# Check the dependencies
|
||||||
if _dependency_exc_info is not None:
|
if _dependency_exc_info is not None:
|
||||||
raise MissingDependencyException(
|
raise MissingDependencyException(
|
||||||
@@ -38,11 +66,13 @@ class PdfConverter(DocumentConverter):
|
|||||||
extension=".pdf",
|
extension=".pdf",
|
||||||
feature="pdf",
|
feature="pdf",
|
||||||
)
|
)
|
||||||
) from _dependency_exc_info[1].with_traceback(
|
) from _dependency_exc_info[
|
||||||
|
1
|
||||||
|
].with_traceback( # type: ignore[union-attr]
|
||||||
_dependency_exc_info[2]
|
_dependency_exc_info[2]
|
||||||
) # Restore the original traceback
|
)
|
||||||
|
|
||||||
|
assert isinstance(file_stream, io.IOBase) # for mypy
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=None,
|
markdown=pdfminer.high_level.extract_text(file_stream),
|
||||||
text_content=pdfminer.high_level.extract_text(local_path),
|
|
||||||
)
|
)
|
||||||
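
With accepts() split out, a caller can probe the PdfConverter before handing it the stream, and convert() passes the stream straight to pdfminer. A minimal usage sketch, assuming the pdf extra is installed; the import paths and sample file name are assumptions, not part of the diff:

import io

from markitdown import StreamInfo                # assumed to be exported here
from markitdown.converters import PdfConverter   # assumed import path

converter = PdfConverter()
info = StreamInfo(mimetype="application/pdf", extension=".pdf")

with open("sample.pdf", "rb") as fh:              # hypothetical sample file
    stream = io.BytesIO(fh.read())
    if converter.accepts(stream, info):           # cheap metadata check, no read
        result = converter.convert(stream, info)  # pdfminer reads the stream
        print(result.markdown[:200])
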
@@ -1,13 +1,26 @@
-import mimetypes
+import sys
 
-from charset_normalizer import from_path
-from typing import Any, Union
-
-from ._base import DocumentConverter, DocumentConverterResult
+from typing import BinaryIO, Any
+from charset_normalizer import from_bytes
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+
+# Try loading optional (but in this case, required) dependencies
+# Save reporting of any exceptions for later
+_dependency_exc_info = None
+try:
+    import mammoth
+except ImportError:
+    # Preserve the error and stack trace for later
+    _dependency_exc_info = sys.exc_info()
+
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "text/",
+    "application/json",
+]
 
 # Mimetypes to ignore (commonly confused extensions)
-IGNORE_MIMETYPES = [
+IGNORE_MIME_TYPE_PREFIXES = [
     "text/vnd.in3d.spot",  # .spo wich is confused with xls, doc, etc.
     "text/vnd.graphviz",  # .dot which is confused with xls, doc, etc.
 ]
@@ -16,34 +29,34 @@ IGNORE_MIMETYPES = [
 class PlainTextConverter(DocumentConverter):
     """Anything with content type text/plain"""
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        for prefix in IGNORE_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return False
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
 
     def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        # Guess the content type from any file extension that might be around
-        content_type, _ = mimetypes.guess_type(
-            "__placeholder" + kwargs.get("file_extension", "")
-        )
-
-        # Ignore common false positives
-        if content_type in IGNORE_MIMETYPES:
-            content_type = None
-
-        # Only accept text files
-        if content_type is None:
-            return None
-        elif all(
-            not content_type.lower().startswith(type_prefix)
-            for type_prefix in ["text/", "application/json"]
-        ):
-            return None
-
-        text_content = str(from_path(local_path).best())
-        return DocumentConverterResult(
-            title=None,
-            text_content=text_content,
-        )
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        if stream_info.charset:
+            text_content = file_stream.read().decode(stream_info.charset)
+        else:
+            text_content = str(from_bytes(file_stream.read()).best())
+
+        return DocumentConverterResult(markdown=text_content)
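
The new convert() trusts a declared charset when the StreamInfo carries one, and otherwise lets charset-normalizer guess from the raw bytes. The same fallback in isolation (the sample bytes are made up):

import io
from typing import Optional

from charset_normalizer import from_bytes

def read_text(stream: io.BytesIO, charset: Optional[str]) -> str:
    # Mirrors the convert() logic above: trust a declared charset, else detect.
    data = stream.read()
    if charset:
        return data.decode(charset)
    return str(from_bytes(data).best())

print(read_text(io.BytesIO("héllo".encode("utf-8")), None))     # detected
print(read_text(io.BytesIO("héllo".encode("utf-8")), "utf-8"))  # declared
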
@@ -1,12 +1,16 @@
+import sys
 import base64
+import os
+import io
 import re
 import html
-import sys
 
-from typing import Union
+from typing import BinaryIO, Any
 
-from ._base import DocumentConverterResult, DocumentConverter
 from ._html_converter import HtmlConverter
+from ._llm_caption import llm_caption
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 
 # Try loading optional (but in this case, required) dependencies
@@ -19,51 +23,46 @@ except ImportError:
     _dependency_exc_info = sys.exc_info()
 
 
-class PptxConverter(HtmlConverter):
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "application/vnd.openxmlformats-officedocument.presentationml",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".pptx"]
+
+
+class PptxConverter(DocumentConverter):
     """
     Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
     """
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
+    def __init__(self):
+        super().__init__()
+        self._html_converter = HtmlConverter()
 
-    def _get_llm_description(
-        self, llm_client, llm_model, image_blob, content_type, prompt=None
-    ):
-        if prompt is None or prompt.strip() == "":
-            prompt = "Write a detailed alt text for this image with less than 50 words."
-
-        image_base64 = base64.b64encode(image_blob).decode("utf-8")
-        data_uri = f"data:{content_type};base64,{image_base64}"
-
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": data_uri,
-                        },
-                    },
-                    {"type": "text", "text": prompt},
-                ],
-            }
-        ]
-
-        response = llm_client.chat.completions.create(
-            model=llm_model, messages=messages
-        )
-        return response.choices[0].message.content
-
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a PPTX
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".pptx":
-            return None
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
         # Check the dependencies
         if _dependency_exc_info is not None:
             raise MissingDependencyException(
@@ -72,11 +71,14 @@ class PptxConverter(HtmlConverter):
                     extension=".pptx",
                     feature="pptx",
                 )
-            ) from _dependency_exc_info[1].with_traceback(
+            ) from _dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
                 _dependency_exc_info[2]
-            )  # Restore the original traceback
+            )
 
-        presentation = pptx.Presentation(local_path)
+        # Perform the conversion
+        presentation = pptx.Presentation(file_stream)
         md_content = ""
         slide_num = 0
         for slide in presentation.slides:
@@ -92,59 +94,58 @@ class PptxConverter(HtmlConverter):
             if self._is_picture(shape):
                 # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
 
-                llm_description = None
-                alt_text = None
+                llm_description = ""
+                alt_text = ""
 
+                # Potentially generate a description using an LLM
                 llm_client = kwargs.get("llm_client")
                 llm_model = kwargs.get("llm_model")
                 if llm_client is not None and llm_model is not None:
+                    # Prepare a file_stream and stream_info for the image data
+                    image_filename = shape.image.filename
+                    image_extension = None
+                    if image_filename:
+                        image_extension = os.path.splitext(image_filename)[1]
+                    image_stream_info = StreamInfo(
+                        mimetype=shape.image.content_type,
+                        extension=image_extension,
+                        filename=image_filename,
+                    )
+
+                    image_stream = io.BytesIO(shape.image.blob)
+
+                    # Caption the image
                     try:
-                        llm_description = self._get_llm_description(
-                            llm_client,
-                            llm_model,
-                            shape.image.blob,
-                            shape.image.content_type,
+                        llm_description = llm_caption(
+                            image_stream,
+                            image_stream_info,
+                            client=llm_client,
+                            model=llm_model,
+                            prompt=kwargs.get("llm_prompt"),
                         )
                     except Exception:
-                        # Unable to describe with LLM
+                        # Unable to generate a description
                         pass
 
-                if not llm_description:
-                    try:
-                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
-                            "descr", ""
-                        )
-                    except Exception:
-                        # Unable to get alt text
-                        pass
+                # Also grab any description embedded in the deck
+                try:
+                    alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
+                except Exception:
+                    # Unable to get alt text
+                    pass
+
+                # Prepare the alt, escaping any special characters
+                alt_text = "\n".join([llm_description, alt_text]) or shape.name
+                alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
+                alt_text = re.sub(r"\s+", " ", alt_text).strip()
 
                 # A placeholder name
                 filename = re.sub(r"\W", "", shape.name) + ".jpg"
-                md_content += (
-                    "\n![" + (alt_text if alt_text else shape.name) + "](" + filename + ")\n"
-                )
+                md_content += "\n![" + alt_text + "](" + filename + ")\n"
 
                 # Tables
                 if self._is_table(shape):
-                    html_table = "<html><body><table>"
-                    first_row = True
-                    for row in shape.table.rows:
-                        html_table += "<tr>"
-                        for cell in row.cells:
-                            if first_row:
-                                html_table += "<th>" + html.escape(cell.text) + "</th>"
-                            else:
-                                html_table += "<td>" + html.escape(cell.text) + "</td>"
-                        html_table += "</tr>"
-                        first_row = False
-                    html_table += "</table></body></html>"
-                    md_content += (
-                        "\n" + self._convert(html_table).text_content.strip() + "\n"
-                    )
+                    md_content += self._convert_table_to_markdown(shape.table)
 
                 # Charts
                 if shape.has_chart:
@@ -174,10 +175,7 @@ class PptxConverter(HtmlConverter):
                 md_content += notes_frame.text
             md_content = md_content.strip()
 
-        return DocumentConverterResult(
-            title=None,
-            text_content=md_content.strip(),
-        )
+        return DocumentConverterResult(markdown=md_content.strip())
 
     def _is_picture(self, shape):
         if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
@@ -192,6 +190,23 @@ class PptxConverter(HtmlConverter):
             return True
         return False
 
+    def _convert_table_to_markdown(self, table):
+        # Write the table as HTML, then convert it to Markdown
+        html_table = "<html><body><table>"
+        first_row = True
+        for row in table.rows:
+            html_table += "<tr>"
+            for cell in row.cells:
+                if first_row:
+                    html_table += "<th>" + html.escape(cell.text) + "</th>"
+                else:
+                    html_table += "<td>" + html.escape(cell.text) + "</td>"
+            html_table += "</tr>"
+            first_row = False
+        html_table += "</table></body></html>"
+
+        return self._html_converter.convert_string(html_table).markdown.strip() + "\n"
+
     def _convert_chart_to_markdown(self, chart):
         md = "\n\n### Chart"
         if chart.has_title:
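
The new _convert_table_to_markdown takes a detour through HTML so that the shared HtmlConverter handles all of the Markdown table escaping. The table-building half of that round trip, extracted as a plain function over lists of strings (the rows argument is a stand-in for python-pptx's table.rows):

import html

def table_to_html(rows):
    # Same construction as _convert_table_to_markdown above: the first row
    # becomes <th> header cells, every later row becomes <td> cells.
    out = "<html><body><table>"
    for i, row in enumerate(rows):
        out += "<tr>"
        for cell in row:
            tag = "th" if i == 0 else "td"
            out += f"<{tag}>{html.escape(cell)}</{tag}>"
        out += "</tr>"
    out += "</table></body></html>"
    return out

print(table_to_html([["Name", "Qty"], ["Widget <1>", "2"]]))
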
@@ -1,128 +1,165 @@
 from xml.dom import minidom
-from typing import Union
+from typing import BinaryIO, Any, Union
 from bs4 import BeautifulSoup
 
 from ._markdownify import _CustomMarkdownify
-from ._base import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+from .._base_converter import DocumentConverter, DocumentConverterResult
+
+PRECISE_MIME_TYPE_PREFIXES = [
+    "application/rss",
+    "application/atom",
+]
+
+PRECISE_FILE_EXTENSIONS = [".rss", ".atom"]
+
+CANDIDATE_MIME_TYPE_PREFIXES = [
+    "text/xml",
+    "application/xml",
+]
+
+CANDIDATE_FILE_EXTENSIONS = [
+    ".xml",
+]
 
 
 class RssConverter(DocumentConverter):
     """Convert RSS / Atom type to markdown"""
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        # Check for precise mimetypes and file extensions
+        if extension in PRECISE_FILE_EXTENSIONS:
+            return True
+
+        for prefix in PRECISE_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        # Check for precise mimetypes and file extensions
+        if extension in CANDIDATE_FILE_EXTENSIONS:
+            return self._check_xml(file_stream)
+
+        for prefix in CANDIDATE_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return self._check_xml(file_stream)
+
+        return False
 
-    def convert(
-        self, local_path: str, **kwargs
-    ) -> Union[None, DocumentConverterResult]:
-        # Bail if not RSS type
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() not in [".xml", ".rss", ".atom"]:
-            return None
+    def _check_xml(self, file_stream: BinaryIO) -> bool:
+        cur_pos = file_stream.tell()
         try:
-            doc = minidom.parse(local_path)
+            doc = minidom.parse(file_stream)
+            return self._feed_type(doc) is not None
         except BaseException as _:
-            return None
-        result = None
+            pass
+        finally:
+            file_stream.seek(cur_pos)
+        return False
+
+    def _feed_type(self, doc: Any) -> str:
         if doc.getElementsByTagName("rss"):
-            # A RSS feed must have a root element of <rss>
-            result = self._parse_rss_type(doc)
+            return "rss"
         elif doc.getElementsByTagName("feed"):
             root = doc.getElementsByTagName("feed")[0]
             if root.getElementsByTagName("entry"):
                 # An Atom feed must have a root element of <feed> and at least one <entry>
-                result = self._parse_atom_type(doc)
-        else:
-            # not rss or atom
-            return None
-
-        return result
+                return "atom"
+        return None
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        doc = minidom.parse(file_stream)
+        feed_type = self._feed_type(doc)
+
+        if feed_type == "rss":
+            return self._parse_rss_type(doc)
+        elif feed_type == "atom":
+            return self._parse_atom_type(doc)
+        else:
+            raise ValueError("Unknown feed type")
 
-    def _parse_atom_type(
-        self, doc: minidom.Document
-    ) -> Union[None, DocumentConverterResult]:
+    def _parse_atom_type(self, doc: minidom.Document) -> DocumentConverterResult:
         """Parse the type of an Atom feed.
 
         Returns None if the feed type is not recognized or something goes wrong.
         """
-        try:
-            root = doc.getElementsByTagName("feed")[0]
-            title = self._get_data_by_tag_name(root, "title")
-            subtitle = self._get_data_by_tag_name(root, "subtitle")
-            entries = root.getElementsByTagName("entry")
-            md_text = f"# {title}\n"
-            if subtitle:
-                md_text += f"{subtitle}\n"
-            for entry in entries:
-                entry_title = self._get_data_by_tag_name(entry, "title")
-                entry_summary = self._get_data_by_tag_name(entry, "summary")
-                entry_updated = self._get_data_by_tag_name(entry, "updated")
-                entry_content = self._get_data_by_tag_name(entry, "content")
-
-                if entry_title:
-                    md_text += f"\n## {entry_title}\n"
-                if entry_updated:
-                    md_text += f"Updated on: {entry_updated}\n"
-                if entry_summary:
-                    md_text += self._parse_content(entry_summary)
-                if entry_content:
-                    md_text += self._parse_content(entry_content)
-
-            return DocumentConverterResult(
-                title=title,
-                text_content=md_text,
-            )
-        except BaseException as _:
-            return None
+        root = doc.getElementsByTagName("feed")[0]
+        title = self._get_data_by_tag_name(root, "title")
+        subtitle = self._get_data_by_tag_name(root, "subtitle")
+        entries = root.getElementsByTagName("entry")
+        md_text = f"# {title}\n"
+        if subtitle:
+            md_text += f"{subtitle}\n"
+        for entry in entries:
+            entry_title = self._get_data_by_tag_name(entry, "title")
+            entry_summary = self._get_data_by_tag_name(entry, "summary")
+            entry_updated = self._get_data_by_tag_name(entry, "updated")
+            entry_content = self._get_data_by_tag_name(entry, "content")
+
+            if entry_title:
+                md_text += f"\n## {entry_title}\n"
+            if entry_updated:
+                md_text += f"Updated on: {entry_updated}\n"
+            if entry_summary:
+                md_text += self._parse_content(entry_summary)
+            if entry_content:
+                md_text += self._parse_content(entry_content)
+
+        return DocumentConverterResult(
+            markdown=md_text,
+            title=title,
+        )
 
-    def _parse_rss_type(
-        self, doc: minidom.Document
-    ) -> Union[None, DocumentConverterResult]:
+    def _parse_rss_type(self, doc: minidom.Document) -> DocumentConverterResult:
         """Parse the type of an RSS feed.
 
         Returns None if the feed type is not recognized or something goes wrong.
         """
-        try:
-            root = doc.getElementsByTagName("rss")[0]
-            channel = root.getElementsByTagName("channel")
-            if not channel:
-                return None
-            channel = channel[0]
-            channel_title = self._get_data_by_tag_name(channel, "title")
-            channel_description = self._get_data_by_tag_name(channel, "description")
-            items = channel.getElementsByTagName("item")
-            if channel_title:
-                md_text = f"# {channel_title}\n"
-            if channel_description:
-                md_text += f"{channel_description}\n"
-            if not items:
-                items = []
-            for item in items:
-                title = self._get_data_by_tag_name(item, "title")
-                description = self._get_data_by_tag_name(item, "description")
-                pubDate = self._get_data_by_tag_name(item, "pubDate")
-                content = self._get_data_by_tag_name(item, "content:encoded")
-
-                if title:
-                    md_text += f"\n## {title}\n"
-                if pubDate:
-                    md_text += f"Published on: {pubDate}\n"
-                if description:
-                    md_text += self._parse_content(description)
-                if content:
-                    md_text += self._parse_content(content)
-
-            return DocumentConverterResult(
-                title=channel_title,
-                text_content=md_text,
-            )
-        except BaseException as _:
-            print(traceback.format_exc())
-            return None
+        root = doc.getElementsByTagName("rss")[0]
+        channel = root.getElementsByTagName("channel")
+        if not channel:
+            return None
+        channel = channel[0]
+        channel_title = self._get_data_by_tag_name(channel, "title")
+        channel_description = self._get_data_by_tag_name(channel, "description")
+        items = channel.getElementsByTagName("item")
+        if channel_title:
+            md_text = f"# {channel_title}\n"
+        if channel_description:
+            md_text += f"{channel_description}\n"
+        if not items:
+            items = []
+        for item in items:
+            title = self._get_data_by_tag_name(item, "title")
+            description = self._get_data_by_tag_name(item, "description")
+            pubDate = self._get_data_by_tag_name(item, "pubDate")
+            content = self._get_data_by_tag_name(item, "content:encoded")
+
+            if title:
+                md_text += f"\n## {title}\n"
+            if pubDate:
+                md_text += f"Published on: {pubDate}\n"
+            if description:
+                md_text += self._parse_content(description)
+            if content:
+                md_text += self._parse_content(content)
+
+        return DocumentConverterResult(
+            markdown=md_text,
+            title=channel_title,
+        )
 
     def _parse_content(self, content: str) -> str:
         """Parse the content of an RSS feed item"""
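
_check_xml has to consume part of the stream to sniff it, so it records the current position and restores it in a finally block; otherwise a failed probe would leave the stream half-read for the next converter in line. The same pattern in isolation:

import io
from xml.dom import minidom

def sniff_is_feed(stream) -> bool:
    # Record the position, probe, and always rewind -- even on parse errors.
    pos = stream.tell()
    try:
        doc = minidom.parse(stream)
        return bool(doc.getElementsByTagName("rss") or doc.getElementsByTagName("feed"))
    except Exception:
        return False
    finally:
        stream.seek(pos)

feed = io.BytesIO(b"<rss><channel><title>t</title></channel></rss>")
print(sniff_is_feed(feed), feed.tell())  # True 0 -- the stream is rewound
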
@@ -0,0 +1,43 @@
+import io
+import sys
+from typing import BinaryIO
+from .._exceptions import MissingDependencyException
+
+# Try loading optional (but in this case, required) dependencies
+# Save reporting of any exceptions for later
+_dependency_exc_info = None
+try:
+    import speech_recognition as sr
+    import pydub
+except ImportError:
+    # Preserve the error and stack trace for later
+    _dependency_exc_info = sys.exc_info()
+
+
+def transcribe_audio(file_stream: BinaryIO, *, audio_format: str = "wav") -> str:
+    # Check for installed dependencies
+    if _dependency_exc_info is not None:
+        raise MissingDependencyException(
+            "Speech transcription requires installing MarkItdown with the [audio-transcription] optional dependencies. E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`"
+        ) from _dependency_exc_info[
+            1
+        ].with_traceback(  # type: ignore[union-attr]
+            _dependency_exc_info[2]
+        )
+
+    if audio_format in ["wav", "aiff", "flac"]:
+        audio_source = file_stream
+    elif audio_format in ["mp3", "mp4"]:
+        audio_segment = pydub.AudioSegment.from_file(file_stream, format=audio_format)
+
+        audio_source = io.BytesIO()
+        audio_segment.export(audio_source, format="wav")
+        audio_source.seek(0)
+    else:
+        raise ValueError(f"Unsupported audio format: {audio_format}")
+
+    recognizer = sr.Recognizer()
+    with sr.AudioFile(audio_source) as source:
+        audio = recognizer.record(source)
+        transcript = recognizer.recognize_google(audio).strip()
+        return "[No speech detected]" if transcript == "" else transcript
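
A usage sketch for the new helper. The module path is an assumption (the diff does not show the file name); recognize_google calls Google's free speech API, so it needs network access, and pydub needs ffmpeg on the PATH to decode mp3/mp4 input:

from markitdown.converters._transcribe_audio import transcribe_audio  # assumed path

with open("clip.mp3", "rb") as fh:  # hypothetical sample clip
    print(transcribe_audio(fh, audio_format="mp3"))
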
@@ -1,72 +0,0 @@
-from typing import Union
-
-from ._base import DocumentConverter, DocumentConverterResult
-from ._media_converter import MediaConverter
-
-# Optional Transcription support
-IS_AUDIO_TRANSCRIPTION_CAPABLE = False
-try:
-    import speech_recognition as sr
-
-    IS_AUDIO_TRANSCRIPTION_CAPABLE = True
-except ModuleNotFoundError:
-    pass
-
-
-class WavConverter(MediaConverter):
-    """
-    Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
-    """
-
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a WAV
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".wav":
-            return None
-
-        md_content = ""
-
-        # Add metadata
-        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
-        if metadata:
-            for f in [
-                "Title",
-                "Artist",
-                "Author",
-                "Band",
-                "Album",
-                "Genre",
-                "Track",
-                "DateTimeOriginal",
-                "CreateDate",
-                "Duration",
-            ]:
-                if f in metadata:
-                    md_content += f"{f}: {metadata[f]}\n"
-
-        # Transcribe
-        if IS_AUDIO_TRANSCRIPTION_CAPABLE:
-            try:
-                transcript = self._transcribe_audio(local_path)
-                md_content += "\n\n### Audio Transcript:\n" + (
-                    "[No speech detected]" if transcript == "" else transcript
-                )
-            except Exception:
-                md_content += (
-                    "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
-                )
-
-        return DocumentConverterResult(
-            title=None,
-            text_content=md_content.strip(),
-        )
-
-    def _transcribe_audio(self, local_path) -> str:
-        recognizer = sr.Recognizer()
-        with sr.AudioFile(local_path) as source:
-            audio = recognizer.record(source)
-            return recognizer.recognize_google(audio).strip()
@@ -1,35 +1,63 @@
+import io
 import re
-from typing import Any, Union
+from typing import Any, BinaryIO, Optional
 from bs4 import BeautifulSoup
 
-from ._base import DocumentConverter, DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
 from ._markdownify import _CustomMarkdownify
 
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "text/html",
+    "application/xhtml",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [
+    ".html",
+    ".htm",
+]
+
 
 class WikipediaConverter(DocumentConverter):
     """Handle Wikipedia pages separately, focusing only on the main document content."""
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        """
+        Make sure we're dealing with HTML content *from* Wikipedia.
+        """
+
+        url = stream_info.url or ""
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
+            # Not a Wikipedia URL
+            return False
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        # Not HTML content
+        return False
 
     def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        # Bail if not Wikipedia
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() not in [".html", ".htm"]:
-            return None
-        url = kwargs.get("url", "")
-        if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
-            return None
-
-        # Parse the file
-        soup = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            soup = BeautifulSoup(fh.read(), "html.parser")
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        # Parse the stream
+        encoding = "utf-8" if stream_info.charset is None else stream_info.charset
+        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
 
         # Remove javascript and style blocks
         for script in soup(["script", "style"]):
@@ -56,6 +84,6 @@ class WikipediaConverter(DocumentConverter):
             webpage_text = _CustomMarkdownify().convert_soup(soup)
 
         return DocumentConverterResult(
+            markdown=webpage_text,
             title=main_title,
-            text_content=webpage_text,
        )
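
Because accepts() now keys off StreamInfo.url instead of a url kwarg, URL-specialized converters can veto a stream before any bytes are read. A sketch of that gating (the import paths are assumptions; the page bytes are made up):

import io

from markitdown import StreamInfo                     # assumed to be exported here
from markitdown.converters import WikipediaConverter  # assumed import path

conv = WikipediaConverter()
page = io.BytesIO(b"<html><body><main>...</main></body></html>")

on_wiki = StreamInfo(extension=".html", url="https://en.wikipedia.org/wiki/Markdown")
elsewhere = StreamInfo(extension=".html", url="https://example.com/page.html")

print(conv.accepts(page, on_wiki))    # True: HTML extension and a wikipedia.org URL
print(conv.accepts(page, elsewhere))  # False: right content type, wrong host
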
@@ -1,10 +1,9 @@
 import sys
-from typing import Union
-
-from ._base import DocumentConverter, DocumentConverterResult
+from typing import BinaryIO, Any
 from ._html_converter import HtmlConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+from .._stream_info import StreamInfo
 
 # Try loading optional (but in this case, required) dependencies
 # Save reporting of any exceptions for later
@@ -22,23 +21,51 @@ try:
 except ImportError:
     _xls_dependency_exc_info = sys.exc_info()
 
+ACCEPTED_XLSX_MIME_TYPE_PREFIXES = [
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+]
+ACCEPTED_XLSX_FILE_EXTENSIONS = [".xlsx"]
 
-class XlsxConverter(HtmlConverter):
+ACCEPTED_XLS_MIME_TYPE_PREFIXES = [
+    "application/vnd.ms-excel",
+    "application/excel",
+]
+ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
+
+
+class XlsxConverter(DocumentConverter):
     """
     Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
     """
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
+    def __init__(self):
+        super().__init__()
+        self._html_converter = HtmlConverter()
 
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a XLSX
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".xlsx":
-            return None
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_XLSX_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_XLSX_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
         # Check the dependencies
         if _xlsx_dependency_exc_info is not None:
             raise MissingDependencyException(
@@ -47,34 +74,58 @@ class XlsxConverter(HtmlConverter):
                     extension=".xlsx",
                     feature="xlsx",
                 )
-            ) from _xlsx_dependency_exc_info[1].with_traceback(
+            ) from _xlsx_dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
                 _xlsx_dependency_exc_info[2]
-            )  # Restore the original traceback
+            )
 
-        sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
+        sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
         md_content = ""
         for s in sheets:
             md_content += f"## {s}\n"
             html_content = sheets[s].to_html(index=False)
-            md_content += self._convert(html_content).text_content.strip() + "\n\n"
+            md_content += (
+                self._html_converter.convert_string(html_content).markdown.strip()
+                + "\n\n"
+            )
 
-        return DocumentConverterResult(
-            title=None,
-            text_content=md_content.strip(),
-        )
+        return DocumentConverterResult(markdown=md_content.strip())
 
 
-class XlsConverter(HtmlConverter):
+class XlsConverter(DocumentConverter):
     """
     Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
     """
 
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a XLS
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".xls":
-            return None
+    def __init__(self):
+        super().__init__()
+        self._html_converter = HtmlConverter()
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_XLS_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_XLS_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
        # Load the dependencies
         if _xls_dependency_exc_info is not None:
             raise MissingDependencyException(
@@ -83,18 +134,20 @@ class XlsConverter(HtmlConverter):
                     extension=".xls",
                     feature="xls",
                 )
-            ) from _xls_dependency_exc_info[1].with_traceback(
+            ) from _xls_dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
                 _xls_dependency_exc_info[2]
-            )  # Restore the original traceback
+            )
 
-        sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
+        sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
         md_content = ""
         for s in sheets:
             md_content += f"## {s}\n"
             html_content = sheets[s].to_html(index=False)
-            md_content += self._convert(html_content).text_content.strip() + "\n\n"
+            md_content += (
+                self._html_converter.convert_string(html_content).markdown.strip()
+                + "\n\n"
+            )
 
-        return DocumentConverterResult(
-            title=None,
-            text_content=md_content.strip(),
-        )
+        return DocumentConverterResult(markdown=md_content.strip())
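
The spreadsheet converters lean entirely on pandas, which reads xlsx/xls from a file-like object just as happily as from a path. A self-contained sketch of the same per-sheet loop, building a workbook in memory first (requires pandas and openpyxl):

import io

import pandas as pd

# Build an in-memory workbook, then read it back the way XlsxConverter does.
buf = io.BytesIO()
pd.DataFrame({"a": [1, 2], "b": [3, 4]}).to_excel(buf, index=False, sheet_name="Sheet1")
buf.seek(0)

sheets = pd.read_excel(buf, sheet_name=None, engine="openpyxl")
for name, frame in sheets.items():
    print(f"## {name}")
    print(frame.to_html(index=False))  # HtmlConverter turns this into a Markdown table
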
@@ -1,14 +1,15 @@
-import re
+import sys
 import json
-import urllib.parse
 import time
+import io
+import re
-from typing import Any, Union, Dict, List
-from urllib.parse import parse_qs, urlparse
+from typing import Any, BinaryIO, Optional, Dict, List, Union
+from urllib.parse import parse_qs, urlparse, unquote
 from bs4 import BeautifulSoup
 
-from ._base import DocumentConverter, DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+from ._markdownify import _CustomMarkdownify
 
 # Optional YouTube transcription support
 try:
@@ -19,53 +20,59 @@ except ModuleNotFoundError:
     IS_YOUTUBE_TRANSCRIPT_CAPABLE = False
 
 
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "text/html",
+    "application/xhtml",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [
+    ".html",
+    ".htm",
+]
+
+
 class YouTubeConverter(DocumentConverter):
     """Handle YouTube specially, focusing on the video title, description, and transcript."""
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
-    def retry_operation(self, operation, retries=3, delay=2):
-        """Retries the operation if it fails."""
-        attempt = 0
-        while attempt < retries:
-            try:
-                return operation()  # Attempt the operation
-            except Exception as e:
-                print(f"Attempt {attempt + 1} failed: {e}")
-                if attempt < retries - 1:
-                    time.sleep(delay)  # Wait before retrying
-                attempt += 1
-        # If all attempts fail, raise the last exception
-        raise Exception(f"Operation failed after {retries} attempts.")
-
-    def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        # Bail if not YouTube
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() not in [".html", ".htm"]:
-            return None
-        url = kwargs.get("url", "")
-
-        url = urllib.parse.unquote(url)
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        """
+        Make sure we're dealing with HTML content *from* YouTube.
+        """
+        url = stream_info.url or ""
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        url = unquote(url)
         url = url.replace(r"\?", "?").replace(r"\=", "=")
 
         if not url.startswith("https://www.youtube.com/watch?"):
-            return None
+            # Not a YouTube URL
+            return False
 
-        # Parse the file with error handling
-        try:
-            with open(local_path, "rt", encoding="utf-8") as fh:
-                soup = BeautifulSoup(fh.read(), "html.parser")
-        except Exception as e:
-            print(f"Error reading YouTube page: {e}")
-            return None
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
 
-        if not soup.title or not soup.title.string:
-            return None
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        # Not HTML content
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        # Parse the stream
+        encoding = "utf-8" if stream_info.charset is None else stream_info.charset
+        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
 
         # Read the meta tags
         metadata: Dict[str, str] = {"title": soup.title.string}
@@ -126,7 +133,7 @@ class YouTubeConverter(DocumentConverter):
 
         if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
             transcript_text = ""
-            parsed_url = urlparse(url)  # type: ignore
+            parsed_url = urlparse(stream_info.url)  # type: ignore
             params = parse_qs(parsed_url.query)  # type: ignore
             if "v" in params and params["v"][0]:
                 video_id = str(params["v"][0])
@@ -135,7 +142,7 @@ class YouTubeConverter(DocumentConverter):
                     "youtube_transcript_languages", ("en",)
                 )
                 # Retry the transcript fetching operation
-                transcript = self.retry_operation(
+                transcript = self._retry_operation(
                     lambda: YouTubeTranscriptApi.get_transcript(
                         video_id, languages=youtube_transcript_languages
                     ),
@@ -158,8 +165,8 @@ class YouTubeConverter(DocumentConverter):
         assert isinstance(title, str)
 
         return DocumentConverterResult(
+            markdown=webpage_text,
             title=title,
-            text_content=webpage_text,
         )
 
     def _get(
@@ -188,3 +195,17 @@ class YouTubeConverter(DocumentConverter):
             if result := self._findKey(v, key):
                 return result
         return None
+
+    def _retry_operation(self, operation, retries=3, delay=2):
+        """Retries the operation if it fails."""
+        attempt = 0
+        while attempt < retries:
+            try:
+                return operation()  # Attempt the operation
+            except Exception as e:
+                print(f"Attempt {attempt + 1} failed: {e}")
+                if attempt < retries - 1:
+                    time.sleep(delay)  # Wait before retrying
+                attempt += 1
+        # If all attempts fail, raise the last exception
+        raise Exception(f"Operation failed after {retries} attempts.")
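
_retry_operation is a plain fixed-delay retry loop, now private and moved to the bottom of the class. The same behavior as a free function, with a deliberately flaky callable to show the retry path:

import time

def retry_operation(operation, retries=3, delay=2):
    # Same control flow as YouTubeConverter._retry_operation above.
    for attempt in range(retries):
        try:
            return operation()
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < retries - 1:
                time.sleep(delay)
    raise Exception(f"Operation failed after {retries} attempts.")

calls = {"n": 0}

def flaky():
    calls["n"] += 1
    if calls["n"] < 3:
        raise RuntimeError("transient")
    return "ok"

print(retry_operation(flaky, retries=3, delay=0))  # prints two failures, then "ok"
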
@@ -1,9 +1,23 @@
-import os
+import sys
 import zipfile
-import shutil
-from typing import Any, Union
+import io
+import os
 
-from ._base import DocumentConverter, DocumentConverterResult
+from typing import BinaryIO, Any, TYPE_CHECKING
+
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+from .._exceptions import UnsupportedFormatException, FileConversionException
+
+# Break otherwise circular import for type hinting
+if TYPE_CHECKING:
+    from .._markitdown import MarkItDown
+
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "application/zip",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".zip"]
 
 
 class ZipConverter(DocumentConverter):
@@ -46,99 +60,58 @@ class ZipConverter(DocumentConverter):
     """
 
     def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+        self,
+        *,
+        markitdown: "MarkItDown",
     ):
-        super().__init__(priority=priority)
+        super().__init__()
+        self._markitdown = markitdown
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
 
     def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        # Bail if not a ZIP
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".zip":
-            return None
-
-        # Get parent converters list if available
-        parent_converters = kwargs.get("_parent_converters", [])
-        if not parent_converters:
-            return DocumentConverterResult(
-                title=None,
-                text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
-            )
-
-        extracted_zip_folder_name = (
-            f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
-        )
-        extraction_dir = os.path.normpath(
-            os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
-        )
-        md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"
-
-        try:
-            # Extract the zip file safely
-            with zipfile.ZipFile(local_path, "r") as zipObj:
-                # Bail if we discover it's an Office OOXML file
-                if "[Content_Types].xml" in zipObj.namelist():
-                    return None
-
-                # Safeguard against path traversal
-                for member in zipObj.namelist():
-                    member_path = os.path.normpath(os.path.join(extraction_dir, member))
-                    if (
-                        not os.path.commonprefix([extraction_dir, member_path])
-                        == extraction_dir
-                    ):
-                        raise ValueError(
-                            f"Path traversal detected in zip file: {member}"
-                        )
-
-                # Extract all files safely
-                zipObj.extractall(path=extraction_dir)
-
-            # Process each extracted file
-            for root, dirs, files in os.walk(extraction_dir):
-                for name in files:
-                    file_path = os.path.join(root, name)
-                    relative_path = os.path.relpath(file_path, extraction_dir)
-
-                    # Get file extension
-                    _, file_extension = os.path.splitext(name)
-
-                    # Update kwargs for the file
-                    file_kwargs = kwargs.copy()
-                    file_kwargs["file_extension"] = file_extension
-                    file_kwargs["_parent_converters"] = parent_converters
-
-                    # Try converting the file using available converters
-                    for converter in parent_converters:
-                        # Skip the zip converter to avoid infinite recursion
-                        if isinstance(converter, ZipConverter):
-                            continue
-
-                        result = converter.convert(file_path, **file_kwargs)
-                        if result is not None:
-                            md_content += f"\n## File: {relative_path}\n\n"
-                            md_content += result.text_content + "\n\n"
-                            break
-
-            # Clean up extracted files if specified
-            if kwargs.get("cleanup_extracted", True):
-                shutil.rmtree(extraction_dir)
-
-            return DocumentConverterResult(title=None, text_content=md_content.strip())
-
-        except zipfile.BadZipFile:
-            return DocumentConverterResult(
-                title=None,
-                text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
-            )
-        except ValueError as ve:
-            return DocumentConverterResult(
-                title=None,
-                text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
-            )
-        except Exception as e:
-            return DocumentConverterResult(
-                title=None,
-                text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
-            )
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        file_path = stream_info.url or stream_info.local_path or stream_info.filename
+        md_content = f"Content from the zip file `{file_path}`:\n\n"
+
+        with zipfile.ZipFile(file_stream, "r") as zipObj:
+            for name in zipObj.namelist():
+                try:
+                    z_file_stream = io.BytesIO(zipObj.read(name))
+                    z_file_stream_info = StreamInfo(
+                        extension=os.path.splitext(name)[1],
+                        filename=os.path.basename(name),
+                    )
+                    result = self._markitdown.convert_stream(
+                        stream=z_file_stream,
+                        stream_info=z_file_stream_info,
+                    )
+                    if result is not None:
+                        md_content += f"## File: {name}\n\n"
+                        md_content += result.markdown + "\n\n"
+                except UnsupportedFormatException:
+                    pass
+                except FileConversionException:
+                    pass
+
+        return DocumentConverterResult(markdown=md_content.strip())
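
The rewritten ZipConverter never touches the filesystem: each archive member is read into a BytesIO and fed back through the owning MarkItDown instance via convert_stream. A sketch of the same loop against a zip built in memory (the markitdown imports are assumptions; everything else mirrors the diff):

import io
import os
import zipfile

from markitdown import MarkItDown, StreamInfo  # StreamInfo export assumed

# Build a small archive entirely in memory.
archive = io.BytesIO()
with zipfile.ZipFile(archive, "w") as z:
    z.writestr("notes/readme.txt", "hello from inside the zip")
archive.seek(0)

md = MarkItDown()
with zipfile.ZipFile(archive, "r") as z:
    for name in z.namelist():
        inner = io.BytesIO(z.read(name))
        info = StreamInfo(
            extension=os.path.splitext(name)[1],
            filename=os.path.basename(name),
        )
        result = md.convert_stream(stream=inner, stream_info=info)
        print(f"## File: {name}")
        print(result.markdown)
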
@@ -7,7 +7,7 @@ from markitdown import __version__
 try:
     from .test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS
 except ImportError:
-    from test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS
+    from test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS  # type: ignore
 
 
 @pytest.fixture(scope="session")
BIN  packages/markitdown/tests/test_files/test.m4a   vendored, executable file (binary file not shown)
BIN  packages/markitdown/tests/test_files/test.mp3   vendored, normal file (binary file not shown)
BIN  packages/markitdown/tests/test_files/test.pdf   vendored, normal file (binary file not shown)
BIN  packages/markitdown/tests/test_files/test.pptx  vendored (binary file not shown)
BIN  packages/markitdown/tests/test_files/test.wav   vendored, normal file (binary file not shown)
@@ -1,89 +1,89 @@
 {
  "cells": [
   {
    "cell_type": "markdown",
    "id": "0f61db80",
    "metadata": {},
    "source": [
     "# Test Notebook"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 11,
    "id": "3f2a5bbd",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "markitdown\n"
      ]
     }
    ],
    "source": [
-    "print('markitdown')"
+    "print(\"markitdown\")"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "9b9c0468",
    "metadata": {},
    "source": [
     "## Code Cell Below"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 10,
    "id": "37d8088a",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "42\n"
      ]
     }
    ],
    "source": [
     "# comment in code\n",
     "print(42)"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "2e3177bd",
    "metadata": {},
    "source": [
     "End\n",
     "\n",
     "---"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
    "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
     "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.12.8"
   },
   "title": "Test Notebook Title"
  },
  "nbformat": 4,
  "nbformat_minor": 5
 }
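The only substantive change in the notebook fixture above is quote normalization in the first code cell (`print('markitdown')` becomes `print("markitdown")`); every other line is unchanged context.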
@@ -2,13 +2,20 @@
 import io
 import os
 import shutil
+import openai

 import pytest
 import requests

-from warnings import catch_warnings, resetwarnings
+import warnings

-from markitdown import MarkItDown, UnsupportedFormatException, FileConversionException
+from markitdown import (
+    MarkItDown,
+    UnsupportedFormatException,
+    FileConversionException,
+    StreamInfo,
+)
+from markitdown._stream_info import _guess_stream_info_from_stream

 skip_remote = (
     True if os.environ.get("GITHUB_ACTIONS") else False
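In this import hunk, `catch_warnings`/`resetwarnings` give way to a plain `import warnings` because the exiftool test below now also calls `warnings.simplefilter`; the new `StreamInfo` and `_guess_stream_info_from_stream` imports back the stream-oriented tests added further down.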
@@ -35,6 +42,13 @@ JPG_TEST_EXIFTOOL = {
     "DateTimeOriginal": "2024:03:14 22:10:00",
 }

+MP3_TEST_EXIFTOOL = {
+    "Title": "f67a499e-a7d0-4ca3-a49b-358bd934ae3e",
+    "Artist": "Artist Name Test String",
+    "Album": "Album Name Test String",
+    "SampleRate": "48000",
+}
+
 PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf"
 PDF_TEST_STRINGS = [
     "While there is contemporaneous exploration of multi-agent approaches"
@@ -162,6 +176,107 @@ def validate_strings(result, expected_strings, exclude_strings=None):
             assert string not in text_content


+def test_stream_info_operations() -> None:
+    """Test operations performed on StreamInfo objects."""
+
+    stream_info_original = StreamInfo(
+        mimetype="mimetype.1",
+        extension="extension.1",
+        charset="charset.1",
+        filename="filename.1",
+        local_path="local_path.1",
+        url="url.1",
+    )
+
+    # Check updating all attributes by keyword
+    keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"]
+    for keyword in keywords:
+        updated_stream_info = stream_info_original.copy_and_update(
+            **{keyword: f"{keyword}.2"}
+        )
+
+        # Make sure the targeted attribute is updated
+        assert getattr(updated_stream_info, keyword) == f"{keyword}.2"
+
+        # Make sure the other attributes are unchanged
+        for k in keywords:
+            if k != keyword:
+                assert getattr(stream_info_original, k) == getattr(
+                    updated_stream_info, k
+                )
+
+    # Check updating all attributes by passing a new StreamInfo object
+    keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"]
+    for keyword in keywords:
+        updated_stream_info = stream_info_original.copy_and_update(
+            StreamInfo(**{keyword: f"{keyword}.2"})
+        )
+
+        # Make sure the targeted attribute is updated
+        assert getattr(updated_stream_info, keyword) == f"{keyword}.2"
+
+        # Make sure the other attributes are unchanged
+        for k in keywords:
+            if k != keyword:
+                assert getattr(stream_info_original, k) == getattr(
+                    updated_stream_info, k
+                )
+
+    # Check mixing and matching
+    updated_stream_info = stream_info_original.copy_and_update(
+        StreamInfo(extension="extension.2", filename="filename.2"),
+        mimetype="mimetype.3",
+        charset="charset.3",
+    )
+    assert updated_stream_info.extension == "extension.2"
+    assert updated_stream_info.filename == "filename.2"
+    assert updated_stream_info.mimetype == "mimetype.3"
+    assert updated_stream_info.charset == "charset.3"
+    assert updated_stream_info.local_path == "local_path.1"
+    assert updated_stream_info.url == "url.1"
+
+    # Check multiple StreamInfo objects
+    updated_stream_info = stream_info_original.copy_and_update(
+        StreamInfo(extension="extension.4", filename="filename.5"),
+        StreamInfo(mimetype="mimetype.6", charset="charset.7"),
+    )
+    assert updated_stream_info.extension == "extension.4"
+    assert updated_stream_info.filename == "filename.5"
+    assert updated_stream_info.mimetype == "mimetype.6"
+    assert updated_stream_info.charset == "charset.7"
+    assert updated_stream_info.local_path == "local_path.1"
+    assert updated_stream_info.url == "url.1"
+
+
+def test_stream_info_guesses() -> None:
+    """Test StreamInfo guesses based on stream content."""
+
+    test_tuples = [
+        (
+            os.path.join(TEST_FILES_DIR, "test.xlsx"),
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        ),
+        (
+            os.path.join(TEST_FILES_DIR, "test.docx"),
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        ),
+        (
+            os.path.join(TEST_FILES_DIR, "test.pptx"),
+            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        ),
+        (os.path.join(TEST_FILES_DIR, "test.xls"), "application/vnd.ms-excel"),
+    ]
+
+    for file_path, expected_mimetype in test_tuples:
+        with open(file_path, "rb") as f:
+            guesses = _guess_stream_info_from_stream(
+                f, filename_hint=os.path.basename(file_path)
+            )
+            assert len(guesses) > 0
+            assert guesses[0].mimetype == expected_mimetype
+            assert guesses[0].extension == os.path.splitext(file_path)[1]
+
+
 @pytest.mark.skipif(
     skip_remote,
     reason="do not run tests that query external urls",
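The assertions above pin down the merge semantics of `StreamInfo.copy_and_update`: positional `StreamInfo` arguments are layered over the original (their unset fields are ignored), and keyword arguments win last. A sketch consistent with the tested behavior, assuming a frozen dataclass; the shipped implementation may differ:

```python
from dataclasses import dataclass, fields
from typing import Optional

@dataclass(frozen=True)
class StreamInfo:
    mimetype: Optional[str] = None
    extension: Optional[str] = None
    charset: Optional[str] = None
    filename: Optional[str] = None
    local_path: Optional[str] = None
    url: Optional[str] = None

    def copy_and_update(self, *updates: "StreamInfo", **kwargs) -> "StreamInfo":
        # Start from this object's fields, layer each positional StreamInfo
        # on top (skipping its unset/None fields), then apply keyword overrides.
        merged = {f.name: getattr(self, f.name) for f in fields(self)}
        for update in updates:
            for f in fields(update):
                value = getattr(update, f.name)
                if value is not None:
                    merged[f.name] = value
        merged.update(kwargs)
        return StreamInfo(**merged)
```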
@@ -183,7 +298,6 @@ def test_markitdown_remote() -> None:
         assert test_string in result.text_content

     # Youtube
-    # TODO: This test randomly fails for some reason. Haven't been able to repro it yet. Disabling until I can debug the issue
     result = markitdown.convert(YOUTUBE_TEST_URL)
     for test_string in YOUTUBE_TEST_STRINGS:
         assert test_string in result.text_content
@@ -192,6 +306,10 @@
 def test_markitdown_local() -> None:
     markitdown = MarkItDown()

+    # Test PDF processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pdf"))
+    validate_strings(result, PDF_TEST_STRINGS)
+
     # Test XLSX processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
     validate_strings(result, XLSX_TEST_STRINGS)
@@ -230,10 +348,6 @@
     )
     validate_strings(result, BLOG_TEST_STRINGS)

-    # Test ZIP file processing
-    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
-    validate_strings(result, XLSX_TEST_STRINGS)
-
     # Test Wikipedia processing
     result = markitdown.convert(
         os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
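The ZIP test removed here is not dropped: it reappears in the next hunk, relocated after the JSON test and expanded to assert DOCX, XLSX, and blog strings.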
@@ -254,24 +368,135 @@ def test_markitdown_local() -> None:
     for test_string in RSS_TEST_STRINGS:
         assert test_string in text_content

-    # Test non-UTF-8 encoding
-    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
-    validate_strings(result, CSV_CP932_TEST_STRINGS)
-
     # Test MSG (Outlook email) processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"))
     validate_strings(result, MSG_TEST_STRINGS)

+    # Test non-UTF-8 encoding
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
+    validate_strings(result, CSV_CP932_TEST_STRINGS)
+
     # Test JSON processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json"))
     validate_strings(result, JSON_TEST_STRINGS)

+    # Test ZIP file processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
+    validate_strings(result, DOCX_TEST_STRINGS)
+    validate_strings(result, XLSX_TEST_STRINGS)
+    validate_strings(result, BLOG_TEST_STRINGS)
+
+    # Test input from a stream
+    input_data = b"<html><body><h1>Test</h1></body></html>"
+    result = markitdown.convert_stream(io.BytesIO(input_data))
+    assert "# Test" in result.text_content
+
     # Test input with leading blank characters
     input_data = b" \n\n\n<html><body><h1>Test</h1></body></html>"
     result = markitdown.convert_stream(io.BytesIO(input_data))
     assert "# Test" in result.text_content
+
+
+def test_markitdown_streams() -> None:
+    markitdown = MarkItDown()
+
+    # Test PDF processing
+    with open(os.path.join(TEST_FILES_DIR, "test.pdf"), "rb") as f:
+        result = markitdown.convert(f, file_extension=".pdf")
+    validate_strings(result, PDF_TEST_STRINGS)
+
+    # Test XLSX processing
+    with open(os.path.join(TEST_FILES_DIR, "test.xlsx"), "rb") as f:
+        result = markitdown.convert(f, file_extension=".xlsx")
+    validate_strings(result, XLSX_TEST_STRINGS)
+
+    # Test XLS processing
+    with open(os.path.join(TEST_FILES_DIR, "test.xls"), "rb") as f:
+        result = markitdown.convert(f, file_extension=".xls")
+    for test_string in XLS_TEST_STRINGS:
+        text_content = result.text_content.replace("\\", "")
+        assert test_string in text_content
+
+    # Test DOCX processing
+    with open(os.path.join(TEST_FILES_DIR, "test.docx"), "rb") as f:
+        result = markitdown.convert(f, file_extension=".docx")
+    validate_strings(result, DOCX_TEST_STRINGS)
+
+    # Test DOCX processing, with comments
+    with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f:
+        result = markitdown.convert(
+            f,
+            file_extension=".docx",
+            style_map="comment-reference => ",
+        )
+    validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
+
+    # Test DOCX processing, with comments and setting style_map on init
+    markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
+    with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f:
+        result = markitdown_with_style_map.convert(f, file_extension=".docx")
+    validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
+
+    # Test PPTX processing
+    with open(os.path.join(TEST_FILES_DIR, "test.pptx"), "rb") as f:
+        result = markitdown.convert(f, file_extension=".pptx")
+    validate_strings(result, PPTX_TEST_STRINGS)
+
+    # Test HTML processing
+    with open(os.path.join(TEST_FILES_DIR, "test_blog.html"), "rb") as f:
+        result = markitdown.convert(f, file_extension=".html", url=BLOG_TEST_URL)
+    validate_strings(result, BLOG_TEST_STRINGS)
+
+    # Test Wikipedia processing
+    with open(os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), "rb") as f:
+        result = markitdown.convert(f, file_extension=".html", url=WIKIPEDIA_TEST_URL)
+    text_content = result.text_content.replace("\\", "")
+    validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)
+
+    # Test Bing processing
+    with open(os.path.join(TEST_FILES_DIR, "test_serp.html"), "rb") as f:
+        result = markitdown.convert(f, file_extension=".html", url=SERP_TEST_URL)
+    text_content = result.text_content.replace("\\", "")
+    validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)
+
+    # Test RSS processing
+    with open(os.path.join(TEST_FILES_DIR, "test_rss.xml"), "rb") as f:
+        result = markitdown.convert(f, file_extension=".xml")
+    text_content = result.text_content.replace("\\", "")
+    for test_string in RSS_TEST_STRINGS:
+        assert test_string in text_content
+
+    # Test MSG (Outlook email) processing
+    with open(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"), "rb") as f:
+        result = markitdown.convert(f, file_extension=".msg")
+    validate_strings(result, MSG_TEST_STRINGS)
+
+    # Test JSON processing
+    with open(os.path.join(TEST_FILES_DIR, "test.json"), "rb") as f:
+        result = markitdown.convert(f, file_extension=".json")
+    validate_strings(result, JSON_TEST_STRINGS)
+
+
+@pytest.mark.skipif(
+    skip_remote,
+    reason="do not run speech transcription tests remotely",
+)
+def test_speech_transcription() -> None:
+    markitdown = MarkItDown()
+
+    # Test WAV, MP3 and M4A files
+    for file_name in ["test.wav", "test.mp3", "test.m4a"]:
+        result = markitdown.convert(os.path.join(TEST_FILES_DIR, file_name))
+        result_lower = result.text_content.lower()
+        assert (
+            ("1" in result_lower or "one" in result_lower)
+            and ("2" in result_lower or "two" in result_lower)
+            and ("3" in result_lower or "three" in result_lower)
+            and ("4" in result_lower or "four" in result_lower)
+            and ("5" in result_lower or "five" in result_lower)
+        )
+
+
 def test_exceptions() -> None:
     # Check that an exception is raised when trying to convert an unsupported format
     markitdown = MarkItDown()
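The pattern exercised by `test_markitdown_streams` is the headline change of this PR: `convert()` now accepts binary file-like objects directly, with `file_extension` standing in for the filename the converter can no longer inspect. A caller-side sketch mirroring the tests (file paths here are placeholders):

```python
import io
from markitdown import MarkItDown

md = MarkItDown()

# Convert from an already-open binary stream; the extension acts as a hint.
with open("report.pdf", "rb") as f:  # placeholder path
    result = md.convert(f, file_extension=".pdf")
print(result.text_content[:200])

# In-memory bytes work the same way via convert_stream.
html = io.BytesIO(b"<html><body><h1>Test</h1></body></html>")
print(md.convert_stream(html).text_content)  # prints "# Test"
```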
@@ -295,17 +520,20 @@ def test_markitdown_exiftool() -> None:
     # Test the automatic discovery of exiftool throws a warning
     # and is disabled
     try:
-        with catch_warnings(record=True) as w:
+        warnings.simplefilter("default")
+        with warnings.catch_warnings(record=True) as w:
             markitdown = MarkItDown()
             result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
             assert len(w) == 1
             assert w[0].category is DeprecationWarning
             assert result.text_content.strip() == ""
     finally:
-        resetwarnings()
+        warnings.resetwarnings()

+    which_exiftool = shutil.which("exiftool")
+    assert which_exiftool is not None
+
     # Test explicitly setting the location of exiftool
-    which_exiftool = shutil.which("exiftool")
     markitdown = MarkItDown(exiftool_path=which_exiftool)
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
     for key in JPG_TEST_EXIFTOOL:
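A note on the `warnings.simplefilter("default")` line added above: mutating the filters also invalidates the per-module `__warningregistry__` deduplication caches, so a `DeprecationWarning` that already fired earlier in the process can be observed again inside `catch_warnings(record=True)`; the test's `finally: warnings.resetwarnings()` then undoes the global change. The canonical capture pattern from the standard library docs, for comparison:

```python
import warnings

def run_and_capture(fn):
    """Call fn() and return (result, captured_warnings)."""
    with warnings.catch_warnings(record=True) as caught:
        # "always" both bypasses and (by mutating filters) invalidates the
        # warning-deduplication registries, so repeat warnings are recorded.
        warnings.simplefilter("always")
        result = fn()
    return result, caught  # filters are restored on exiting the context
```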
@@ -320,6 +548,12 @@ def test_markitdown_exiftool() -> None:
         target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
         assert target in result.text_content

+    # Test some other media types
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.mp3"))
+    for key in MP3_TEST_EXIFTOOL:
+        target = f"{key}: {MP3_TEST_EXIFTOOL[key]}"
+        assert target in result.text_content
+

 @pytest.mark.skipif(
     skip_llm,
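The metadata keys asserted above (`Title`, `Artist`, `Album`, `SampleRate`) are exiftool field names. This class of check ultimately amounts to shelling out to exiftool's JSON mode; a rough sketch of that call, not MarkItDown's actual code:

```python
import json
import shutil
import subprocess

def read_metadata(path: str) -> dict:
    """Return exiftool's metadata for path as a dict (first/only file)."""
    exiftool = shutil.which("exiftool")
    if exiftool is None:
        raise RuntimeError("exiftool is not installed or not on PATH")
    output = subprocess.run(
        [exiftool, "-json", path], capture_output=True, check=True, text=True
    ).stdout
    return json.loads(output)[0]

# e.g. read_metadata("test.mp3")["Title"], ["Artist"], ["Album"], ["SampleRate"]
```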
@@ -330,7 +564,6 @@ def test_markitdown_llm() -> None:
     markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o")

     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg"))
-
     for test_string in LLM_TEST_STRINGS:
         assert test_string in result.text_content

@@ -339,12 +572,24 @@
     for test_string in ["red", "circle", "blue", "square"]:
         assert test_string in result.text_content.lower()

+    # Images embedded in PPTX files
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
+    # LLM captions are included
+    for test_string in LLM_TEST_STRINGS:
+        assert test_string in result.text_content
+    # Standard alt text is included
+    validate_strings(result, PPTX_TEST_STRINGS)
+
+
 if __name__ == "__main__":
     """Runs this file's tests from the command line."""
+    test_stream_info_operations()
+    test_stream_info_guesses()
     test_markitdown_remote()
     test_markitdown_local()
+    test_markitdown_streams()
+    test_speech_transcription()
     test_exceptions()
     test_markitdown_exiftool()
-    # test_markitdown_llm()
+    test_markitdown_llm()
     print("All tests passed!")