Compare commits
1 commit: de2c56ffbc
@@ -6,8 +6,7 @@

 > [!IMPORTANT]
 > Breaking changes between 0.0.1 and 0.1.0:
-> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]'` to have backward-compatible behavior.
-> * convert_stream() now requires a binary file-like object (e.g., a file opened in binary mode, or an io.BytesIO object). This is a breaking change from the previous version, where it previously also accepted text file-like objects, like io.StringIO.
+> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]~=0.1.0a1'` to have backward-compatible behavior.
 > * The DocumentConverter class interface has changed to read from file-like streams rather than file paths. *No temporary files are created anymore*. If you are the maintainer of a plugin, or custom DocumentConverter, you likely need to update your code. Otherwise, if only using the MarkItDown class or CLI (as in these examples), you should not need to change anything.

 MarkItDown is a lightweight Python utility for converting various files to Markdown for use with LLMs and related text analysis pipelines. To this end, it is most comparable to [textract](https://github.com/deanmalmgren/textract), but with a focus on preserving important document structure and content as Markdown (including: headings, lists, tables, links, etc.) While the output is often reasonably presentable and human-friendly, it is meant to be consumed by text analysis tools -- and may not be the best option for high-fidelity document conversions for human consumption.
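The `convert_stream()` bullet above requires a binary file-like object. A minimal sketch of that contract (the file name is illustrative):

```python
import io
from markitdown import MarkItDown

md = MarkItDown(enable_plugins=False)

# A file opened in binary mode satisfies the requirement:
with open("report.docx", "rb") as fh:
    print(md.convert_stream(fh).markdown)

# So does an in-memory io.BytesIO object:
print(md.convert_stream(io.BytesIO(b"# Hello\n")).markdown)

# io.StringIO (a text stream) is no longer accepted as of 0.1.0.
```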
@@ -15,7 +14,7 @@ MarkItDown is a lightweight Python utility for converting various files to Markd
 At present, MarkItDown supports:

 - PDF
-- PowerPoint
+- PowerPoint (reading in top-to-bottom, left-to-right order)
 - Word
 - Excel
 - Images (EXIF metadata and OCR)
@@ -24,7 +23,6 @@ At present, MarkItDown supports:
 - Text-based formats (CSV, JSON, XML)
 - ZIP files (iterates over contents)
 - Youtube URLs
-- EPubs
 - ... and more!

 ## Why Markdown?
@@ -38,7 +36,7 @@ are also highly token-efficient.

 ## Installation

-To install MarkItDown, use pip: `pip install 'markitdown[all]'`. Alternatively, you can install it from the source:
+To install MarkItDown, use pip: `pip install 'markitdown[all]~=0.1.0a1'`. Alternatively, you can install it from the source:

 ```bash
 git clone git@github.com:microsoft/markitdown.git
@@ -27,7 +27,7 @@ dependencies = [
   "beautifulsoup4",
   "requests",
   "markdownify",
-  "magika~=0.6.1",
+  "magika>=0.6.0rc1",
   "charset-normalizer",
 ]
@@ -42,7 +42,7 @@ all = [
   "olefile",
   "pydub",
   "SpeechRecognition",
-  "youtube-transcript-api~=1.0.0",
+  "youtube-transcript-api",
   "azure-ai-documentintelligence",
   "azure-identity"
 ]
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.1"
+__version__ = "0.1.0a2"
@@ -4,7 +4,6 @@
 import argparse
 import sys
 import codecs
-import locale
 from textwrap import dedent
 from importlib.metadata import entry_points
 from .__about__ import __version__
@@ -105,12 +104,6 @@ def main():
         help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.",
     )

-    parser.add_argument(
-        "--keep-data-uris",
-        action="store_true",
-        help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
-    )
-
     parser.add_argument("filename", nargs="?")
     args = parser.parse_args()
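The removed `--keep-data-uris` flag mirrors a `keep_data_uris` keyword accepted by the Python API on the 0.1.1 side, as the conversion hunks below show. A short sketch (the file name is illustrative):

```python
from markitdown import MarkItDown

md = MarkItDown()

# Default behavior: data URIs (e.g., base64-encoded images) are truncated.
result = md.convert("slides.pptx")

# 0.1.1 side only: keep full data URIs in the output.
result = md.convert("slides.pptx", keep_data_uris=True)
print(result.markdown)
```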
@@ -146,7 +139,7 @@ def main():
     else:
         charset_hint = None

-    stream_info = None
+    stream_info: str | None = None
     if (
         extension_hint is not None
         or mime_type_hint is not None
@@ -188,15 +181,9 @@ def main():
     markitdown = MarkItDown(enable_plugins=args.use_plugins)

     if args.filename is None:
-        result = markitdown.convert_stream(
-            sys.stdin.buffer,
-            stream_info=stream_info,
-            keep_data_uris=args.keep_data_uris,
-        )
+        result = markitdown.convert_stream(sys.stdin.buffer, stream_info=stream_info)
     else:
-        result = markitdown.convert(
-            args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
-        )
+        result = markitdown.convert(args.filename, stream_info=stream_info)

     _handle_output(args, result)
@@ -205,14 +192,9 @@ def _handle_output(args, result: DocumentConverterResult):
     """Handle output to stdout or file"""
     if args.output:
         with open(args.output, "w", encoding="utf-8") as f:
-            f.write(result.markdown)
+            f.write(result.text_content)
     else:
-        # Handle stdout encoding errors more gracefully
-        print(
-            result.markdown.encode(sys.stdout.encoding, errors="replace").decode(
-                sys.stdout.encoding
-            )
-        )
+        print(result.text_content)


 def _exit_with_error(message: str):
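The deleted `print` wrapper implements graceful stdout handling: encode with `errors="replace"`, then decode, so consoles with limited encodings print `?` instead of raising `UnicodeEncodeError`. A small illustration, with cp1252 standing in for `sys.stdout.encoding`:

```python
# Round-trip through the console encoding; characters it cannot
# represent become "?" instead of crashing (encoding is illustrative).
markdown = "Snowman: \u2603"
safe = markdown.encode("cp1252", errors="replace").decode("cp1252")
print(safe)  # -> Snowman: ?
```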
@@ -20,7 +20,6 @@ import charset_normalizer
 import codecs

 from ._stream_info import StreamInfo
-from ._uri_utils import parse_data_uri, file_uri_to_path

 from .converters import (
     PlainTextConverter,
@@ -39,7 +38,6 @@ from .converters import (
     AudioConverter,
     OutlookMsgConverter,
     ZipConverter,
-    EpubConverter,
     DocumentIntelligenceConverter,
 )

@@ -193,7 +191,6 @@ class MarkItDown:
         self.register_converter(IpynbConverter())
         self.register_converter(PdfConverter())
         self.register_converter(OutlookMsgConverter())
-        self.register_converter(EpubConverter())

         # Register Document Intelligence converter at the top of the stack if endpoint is provided
         docintel_endpoint = kwargs.get("docintel_endpoint")
@@ -243,10 +240,9 @@ class MarkItDown:
         # Local path or url
         if isinstance(source, str):
             if (
-                source.startswith("http:")
-                or source.startswith("https:")
-                or source.startswith("file:")
-                or source.startswith("data:")
+                source.startswith("http://")
+                or source.startswith("https://")
+                or source.startswith("file://")
             ):
                 # Rename the url argument to mock_url
                 # (Deprecated -- use stream_info)
@@ -255,7 +251,7 @@ class MarkItDown:
                 _kwargs["mock_url"] = _kwargs["url"]
                 del _kwargs["url"]

-                return self.convert_uri(source, stream_info=stream_info, **_kwargs)
+                return self.convert_url(source, stream_info=stream_info, **_kwargs)
             else:
                 return self.convert_local(source, stream_info=stream_info, **kwargs)
         # Path object
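Taken together with the next hunk, `convert()` dispatches plain strings by URI scheme, and the 0.1.1 side additionally routes `file:` and `data:` URIs through `convert_uri()`. A sketch of calls each side accepts (URLs are illustrative; network and file access are assumed):

```python
from markitdown import MarkItDown

md = MarkItDown()

# Dispatched to the HTTP path on both sides of this comparison:
md.convert("https://example.com/page.html")

# Accepted by the 0.1.1 side's convert_uri() dispatch:
md.convert("file:///tmp/report.docx")
md.convert("data:text/plain;charset=utf-8,Hello%2C%20World%21")
```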
@@ -365,80 +361,22 @@ class MarkItDown:
         url: str,
         *,
         stream_info: Optional[StreamInfo] = None,
-        file_extension: Optional[str] = None,
-        mock_url: Optional[str] = None,
-        **kwargs: Any,
-    ) -> DocumentConverterResult:
-        """Alias for convert_uri()"""
-        # convert_url will likely be deprecated in the future in favor of convert_uri
-        return self.convert_uri(
-            url,
-            stream_info=stream_info,
-            file_extension=file_extension,
-            mock_url=mock_url,
-            **kwargs,
-        )
-
-    def convert_uri(
-        self,
-        uri: str,
-        *,
-        stream_info: Optional[StreamInfo] = None,
         file_extension: Optional[str] = None,  # Deprecated -- use stream_info
         mock_url: Optional[
             str
         ] = None,  # Mock the request as if it came from a different URL
         **kwargs: Any,
-    ) -> DocumentConverterResult:
-        uri = uri.strip()
-
-        # File URIs
-        if uri.startswith("file:"):
-            netloc, path = file_uri_to_path(uri)
-            if netloc and netloc != "localhost":
-                raise ValueError(
-                    f"Unsupported file URI: {uri}. Netloc must be empty or localhost."
-                )
-            return self.convert_local(
-                path,
-                stream_info=stream_info,
-                file_extension=file_extension,
-                url=mock_url,
-                **kwargs,
-            )
-        # Data URIs
-        elif uri.startswith("data:"):
-            mimetype, attributes, data = parse_data_uri(uri)
-
-            base_guess = StreamInfo(
-                mimetype=mimetype,
-                charset=attributes.get("charset"),
-            )
-            if stream_info is not None:
-                base_guess = base_guess.copy_and_update(stream_info)
-
-            return self.convert_stream(
-                io.BytesIO(data),
-                stream_info=base_guess,
-                file_extension=file_extension,
-                url=mock_url,
-                **kwargs,
-            )
-        # HTTP/HTTPS URIs
-        elif uri.startswith("http:") or uri.startswith("https:"):
-            response = self._requests_session.get(uri, stream=True)
-            response.raise_for_status()
-            return self.convert_response(
-                response,
-                stream_info=stream_info,
-                file_extension=file_extension,
-                url=mock_url,
-                **kwargs,
-            )
-        else:
-            raise ValueError(
-                f"Unsupported URI scheme: {uri.split(':')[0]}. Supported schemes are: file:, data:, http:, https:"
-            )
+    ) -> DocumentConverterResult:  # TODO: fix kwargs type
+        # Send a HTTP request to the URL
+        response = self._requests_session.get(url, stream=True)
+        response.raise_for_status()
+        return self.convert_response(
+            response,
+            stream_info=stream_info,
+            file_extension=file_extension,
+            url=mock_url,
+            **kwargs,
+        )

     def convert_response(
         self,
@@ -672,16 +610,14 @@ class MarkItDown:
         # Call magika to guess from the stream
         cur_pos = file_stream.tell()
         try:
-            result = self._magika.identify_stream(file_stream)
+            stream_bytes = file_stream.read()
+            result = self._magika.identify_bytes(stream_bytes)
             if result.status == "ok" and result.prediction.output.label != "unknown":
                 # If it's text, also guess the charset
                 charset = None
                 if result.prediction.output.is_text:
-                    # Read the first 4k to guess the charset
-                    file_stream.seek(cur_pos)
-                    stream_page = file_stream.read(4096)
-                    charset_result = charset_normalizer.from_bytes(stream_page).best()
+                    charset_result = charset_normalizer.from_bytes(stream_bytes).best()
                     if charset_result is not None:
                         charset = self._normalize_charset(charset_result.encoding)
@@ -1,52 +0,0 @@
-import base64
-import os
-from typing import Tuple, Dict
-from urllib.request import url2pathname
-from urllib.parse import urlparse, unquote_to_bytes
-
-
-def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]:
-    """Convert a file URI to a local file path"""
-    parsed = urlparse(file_uri)
-    if parsed.scheme != "file":
-        raise ValueError(f"Not a file URL: {file_uri}")
-
-    netloc = parsed.netloc if parsed.netloc else None
-    path = os.path.abspath(url2pathname(parsed.path))
-    return netloc, path
-
-
-def parse_data_uri(uri: str) -> Tuple[str | None, Dict[str, str], bytes]:
-    if not uri.startswith("data:"):
-        raise ValueError("Not a data URI")
-
-    header, _, data = uri.partition(",")
-    if not _:
-        raise ValueError("Malformed data URI, missing ',' separator")
-
-    meta = header[5:]  # Strip 'data:'
-    parts = meta.split(";")
-
-    is_base64 = False
-    # Ends with base64?
-    if parts[-1] == "base64":
-        parts.pop()
-        is_base64 = True
-
-    mime_type = None  # Normally this would default to text/plain but we won't assume
-    if len(parts) and len(parts[0]) > 0:
-        # First part is the mime type
-        mime_type = parts.pop(0)
-
-    attributes: Dict[str, str] = {}
-    for part in parts:
-        # Handle key=value pairs in the middle
-        if "=" in part:
-            key, value = part.split("=", 1)
-            attributes[key] = value
-        elif len(part) > 0:
-            attributes[part] = ""
-
-    content = base64.b64decode(data) if is_base64 else unquote_to_bytes(data)
-
-    return mime_type, attributes, content
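The removed `_uri_utils` helpers are small, and their behavior is pinned down by unit tests that this comparison also deletes (see the test hunks near the end). A few representative cases as a sketch, usable on the 0.1.1 side:

```python
from markitdown._uri_utils import parse_data_uri, file_uri_to_path

# Base64-encoded payloads are decoded to bytes:
mime_type, attributes, data = parse_data_uri(
    "data:text/plain;charset=utf-8;base64,SGVsbG8sIFdvcmxkIQ=="
)
assert mime_type == "text/plain"
assert attributes["charset"] == "utf-8"
assert data == b"Hello, World!"

# Percent-encoded payloads work too, and the mime type may be omitted:
mime_type, attributes, data = parse_data_uri("data:,Hello%2C%20World%21")
assert mime_type is None and data == b"Hello, World!"

# File URIs are split into (netloc, local path):
netloc, path = file_uri_to_path("file://localhost/path/to/file.txt")
assert netloc == "localhost" and path == "/path/to/file.txt"
```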
@@ -18,7 +18,6 @@ from ._audio_converter import AudioConverter
 from ._outlook_msg_converter import OutlookMsgConverter
 from ._zip_converter import ZipConverter
 from ._doc_intel_converter import DocumentIntelligenceConverter
-from ._epub_converter import EpubConverter

 __all__ = [
     "PlainTextConverter",
@@ -38,5 +37,4 @@ __all__ = [
     "OutlookMsgConverter",
     "ZipConverter",
     "DocumentIntelligenceConverter",
-    "EpubConverter",
 ]
@@ -1,7 +1,6 @@
 import io
 import re
 import base64
-import binascii
 from urllib.parse import parse_qs, urlparse
 from typing import Any, BinaryIO, Optional
 from bs4 import BeautifulSoup
@@ -61,8 +60,6 @@ class BingSerpConverter(DocumentConverter):
         stream_info: StreamInfo,
         **kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
-        assert stream_info.url is not None
-
         # Parse the query parameters
         parsed_params = parse_qs(urlparse(stream_info.url).query)
         query = parsed_params.get("q", [""])[0]
@@ -79,12 +76,9 @@ class BingSerpConverter(DocumentConverter):
             slug.extract()

         # Parse the algorithmic results
-        _markdownify = _CustomMarkdownify(**kwargs)
+        _markdownify = _CustomMarkdownify()
         results = list()
         for result in soup.find_all(class_="b_algo"):
-            if not hasattr(result, "find_all"):
-                continue
-
             # Rewrite redirect urls
             for a in result.find_all("a", href=True):
                 parsed_href = urlparse(a["href"])
@@ -73,5 +73,5 @@ class DocxConverter(HtmlConverter):

         style_map = kwargs.get("style_map", None)
         return self._html_converter.convert_string(
-            mammoth.convert_to_html(file_stream, style_map=style_map).value, **kwargs
+            mammoth.convert_to_html(file_stream, style_map=style_map).value
         )
@@ -1,147 +0,0 @@
-import os
-import zipfile
-import xml.dom.minidom as minidom
-
-from typing import BinaryIO, Any, Dict, List
-
-from ._html_converter import HtmlConverter
-from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
-
-ACCEPTED_MIME_TYPE_PREFIXES = [
-    "application/epub",
-    "application/epub+zip",
-    "application/x-epub+zip",
-]
-
-ACCEPTED_FILE_EXTENSIONS = [".epub"]
-
-MIME_TYPE_MAPPING = {
-    ".html": "text/html",
-    ".xhtml": "application/xhtml+xml",
-}
-
-
-class EpubConverter(HtmlConverter):
-    """
-    Converts EPUB files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
-    """
-
-    def __init__(self):
-        super().__init__()
-        self._html_converter = HtmlConverter()
-
-    def accepts(
-        self,
-        file_stream: BinaryIO,
-        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
-    ) -> bool:
-        mimetype = (stream_info.mimetype or "").lower()
-        extension = (stream_info.extension or "").lower()
-
-        if extension in ACCEPTED_FILE_EXTENSIONS:
-            return True
-
-        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
-            if mimetype.startswith(prefix):
-                return True
-
-        return False
-
-    def convert(
-        self,
-        file_stream: BinaryIO,
-        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
-    ) -> DocumentConverterResult:
-        with zipfile.ZipFile(file_stream, "r") as z:
-            # Extracts metadata (title, authors, language, publisher, date, description, cover) from an EPUB file.
-
-            # Locate content.opf
-            container_dom = minidom.parse(z.open("META-INF/container.xml"))
-            opf_path = container_dom.getElementsByTagName("rootfile")[0].getAttribute(
-                "full-path"
-            )
-
-            # Parse content.opf
-            opf_dom = minidom.parse(z.open(opf_path))
-            metadata: Dict[str, Any] = {
-                "title": self._get_text_from_node(opf_dom, "dc:title"),
-                "authors": self._get_all_texts_from_nodes(opf_dom, "dc:creator"),
-                "language": self._get_text_from_node(opf_dom, "dc:language"),
-                "publisher": self._get_text_from_node(opf_dom, "dc:publisher"),
-                "date": self._get_text_from_node(opf_dom, "dc:date"),
-                "description": self._get_text_from_node(opf_dom, "dc:description"),
-                "identifier": self._get_text_from_node(opf_dom, "dc:identifier"),
-            }
-
-            # Extract manifest items (ID → href mapping)
-            manifest = {
-                item.getAttribute("id"): item.getAttribute("href")
-                for item in opf_dom.getElementsByTagName("item")
-            }
-
-            # Extract spine order (ID refs)
-            spine_items = opf_dom.getElementsByTagName("itemref")
-            spine_order = [item.getAttribute("idref") for item in spine_items]
-
-            # Convert spine order to actual file paths
-            base_path = "/".join(
-                opf_path.split("/")[:-1]
-            )  # Get base directory of content.opf
-            spine = [
-                f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
-                for item_id in spine_order
-                if item_id in manifest
-            ]
-
-            # Extract and convert the content
-            markdown_content: List[str] = []
-            for file in spine:
-                if file in z.namelist():
-                    with z.open(file) as f:
-                        filename = os.path.basename(file)
-                        extension = os.path.splitext(filename)[1].lower()
-                        mimetype = MIME_TYPE_MAPPING.get(extension)
-                        converted_content = self._html_converter.convert(
-                            f,
-                            StreamInfo(
-                                mimetype=mimetype,
-                                extension=extension,
-                                filename=filename,
-                            ),
-                        )
-                        markdown_content.append(converted_content.markdown.strip())
-
-            # Format and add the metadata
-            metadata_markdown = []
-            for key, value in metadata.items():
-                if isinstance(value, list):
-                    value = ", ".join(value)
-                if value:
-                    metadata_markdown.append(f"**{key.capitalize()}:** {value}")
-
-            markdown_content.insert(0, "\n".join(metadata_markdown))
-
-            return DocumentConverterResult(
-                markdown="\n\n".join(markdown_content), title=metadata["title"]
-            )
-
-    def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None:
-        """Convenience function to extract a single occurrence of a tag (e.g., title)."""
-        texts = self._get_all_texts_from_nodes(dom, tag_name)
-        if len(texts) > 0:
-            return texts[0]
-        else:
-            return None
-
-    def _get_all_texts_from_nodes(
-        self, dom: minidom.Document, tag_name: str
-    ) -> List[str]:
-        """Helper function to extract all occurrences of a tag (e.g., multiple authors)."""
-        texts: List[str] = []
-        for node in dom.getElementsByTagName(tag_name):
-            if node.firstChild and hasattr(node.firstChild, "nodeValue"):
-                texts.append(node.firstChild.nodeValue.strip())
-        return texts
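The removed converter follows the standard EPUB layout: `META-INF/container.xml` names the OPF package file, whose manifest maps ids to hrefs and whose spine gives reading order. A minimal sketch of that same walk, assuming a hypothetical `book.epub`:

```python
import zipfile
import xml.dom.minidom as minidom

# List an EPUB's reading order via the container.xml -> OPF -> spine
# chain used by the converter above ("book.epub" is hypothetical).
with zipfile.ZipFile("book.epub") as z:
    container = minidom.parse(z.open("META-INF/container.xml"))
    opf_path = container.getElementsByTagName("rootfile")[0].getAttribute("full-path")
    opf = minidom.parse(z.open(opf_path))
    manifest = {
        item.getAttribute("id"): item.getAttribute("href")
        for item in opf.getElementsByTagName("item")
    }
    for itemref in opf.getElementsByTagName("itemref"):
        print(manifest.get(itemref.getAttribute("idref")))
```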
@@ -56,9 +56,9 @@ class HtmlConverter(DocumentConverter):
         body_elm = soup.find("body")
         webpage_text = ""
         if body_elm:
-            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
+            webpage_text = _CustomMarkdownify().convert_soup(body_elm)
         else:
-            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
+            webpage_text = _CustomMarkdownify().convert_soup(soup)

         assert isinstance(webpage_text, str)
@@ -17,7 +17,6 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):

     def __init__(self, **options: Any):
         options["heading_style"] = options.get("heading_style", markdownify.ATX)
-        options["keep_data_uris"] = options.get("keep_data_uris", False)
         # Explicitly cast options to the expected type if necessary
         super().__init__(**options)
@@ -102,7 +101,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
             return alt

         # Remove dataURIs
-        if src.startswith("data:") and not self.options["keep_data_uris"]:
+        if src.startswith("data:"):
             src = src.split(",")[0] + "..."

         return "![%s](%s%s)" % (alt, src, title_part)
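With the code above, an inline image such as `![chart](data:image/png;base64,iVBOR...)` is always shortened to its header on the 0.1.0a2 side, while the 0.1.1 side makes this conditional on `keep_data_uris`. The truncation itself is just:

```python
# Keep everything before the comma and mark the elision, as the
# converter above does; the src value is illustrative.
src = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUg"
truncated = src.split(",")[0] + "..."
assert truncated == "data:image/png;base64..."
```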
@@ -9,7 +9,7 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 _dependency_exc_info = None
 olefile = None
 try:
-    import olefile  # type: ignore[no-redef]
+    import olefile
 except ImportError:
     # Preserve the error and stack trace for later
     _dependency_exc_info = sys.exc_info()
@@ -56,13 +56,12 @@ class OutlookMsgConverter(DocumentConverter):

         # Brute force, check if it's an Outlook file
         try:
-            if olefile is not None:
-                msg = olefile.OleFileIO(file_stream)
-                toc = "\n".join([str(stream) for stream in msg.listdir()])
-                return (
-                    "__properties_version1.0" in toc
-                    and "__recip_version1.0_#00000000" in toc
-                )
+            msg = olefile.OleFileIO(file_stream)
+            toc = "\n".join([str(stream) for stream in msg.listdir()])
+            return (
+                "__properties_version1.0" in toc
+                and "__recip_version1.0_#00000000" in toc
+            )
         except Exception as e:
             pass
         finally:
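Both sides detect Outlook `.msg` files by opening the stream as an OLE compound document and checking for characteristic stream names. A sketch of that probe, assuming olefile is installed and a hypothetical `message.msg`:

```python
import olefile

# Look for the property stream the converter above treats as an
# MSG signature ("message.msg" is a hypothetical file).
with open("message.msg", "rb") as fh:
    if olefile.isOleFile(fh):
        ole = olefile.OleFileIO(fh)
        streams = ["/".join(entry) for entry in ole.listdir()]
        print(any("__properties_version1.0" in s for s in streams))
```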
@@ -90,11 +89,7 @@ class OutlookMsgConverter(DocumentConverter):
                 _dependency_exc_info[2]
             )

-        assert (
-            olefile is not None
-        )  # If we made it this far, olefile should be available
         msg = olefile.OleFileIO(file_stream)

         # Extract email metadata
         md_content = "# Email Message\n\n"
@@ -126,7 +121,6 @@ class OutlookMsgConverter(DocumentConverter):

     def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
         """Helper to safely extract and decode stream data from the MSG file."""
-        assert olefile is not None
         assert isinstance(
             msg, olefile.OleFileIO
         )  # Ensure msg is of the correct type (type hinting is not possible with the optional olefile package)
@@ -17,16 +17,12 @@ except ImportError:
 ACCEPTED_MIME_TYPE_PREFIXES = [
     "text/",
     "application/json",
-    "application/markdown",
 ]

-ACCEPTED_FILE_EXTENSIONS = [
-    ".txt",
-    ".text",
-    ".md",
-    ".markdown",
-    ".json",
-    ".jsonl",
+# Mimetypes to ignore (commonly confused extensions)
+IGNORE_MIME_TYPE_PREFIXES = [
+    "text/vnd.in3d.spot",  # .spo which is confused with xls, doc, etc.
+    "text/vnd.graphviz",  # .dot which is confused with xls, doc, etc.
 ]

@@ -42,14 +38,9 @@ class PlainTextConverter(DocumentConverter):
         mimetype = (stream_info.mimetype or "").lower()
         extension = (stream_info.extension or "").lower()

-        # If we have a charset, we can safely assume it's text
-        # With Magika in the earlier stages, this handles most cases
-        if stream_info.charset is not None:
-            return True
-
-        # Otherwise, check the mimetype and extension
-        if extension in ACCEPTED_FILE_EXTENSIONS:
-            return True
-
+        for prefix in IGNORE_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return False

         for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
             if mimetype.startswith(prefix):
@@ -140,20 +140,13 @@ class PptxConverter(DocumentConverter):
                 alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
                 alt_text = re.sub(r"\s+", " ", alt_text).strip()

-                # If keep_data_uris is True, use base64 encoding for images
-                if kwargs.get("keep_data_uris", False):
-                    blob = shape.image.blob
-                    content_type = shape.image.content_type or "image/png"
-                    b64_string = base64.b64encode(blob).decode("utf-8")
-                    md_content += f"\n![{alt_text if alt_text else shape.name}](data:{content_type};base64,{b64_string})\n"
-                else:
-                    # A placeholder name
-                    filename = re.sub(r"\W", "", shape.name) + ".jpg"
-                    md_content += f"\n![{alt_text if alt_text else shape.name}]({filename})\n"
+                # A placeholder name
+                filename = re.sub(r"\W", "", shape.name) + ".jpg"
+                md_content += f"\n![{alt_text if alt_text else shape.name}]({filename})\n"

             # Tables
             if self._is_table(shape):
-                md_content += self._convert_table_to_markdown(shape.table, **kwargs)
+                md_content += self._convert_table_to_markdown(shape.table)

             # Charts
             if shape.has_chart:
@@ -200,7 +193,7 @@ class PptxConverter(DocumentConverter):
             return True
         return False

-    def _convert_table_to_markdown(self, table, **kwargs):
+    def _convert_table_to_markdown(self, table):
         # Write the table as HTML, then convert it to Markdown
         html_table = "<html><body><table>"
         first_row = True
@@ -215,10 +208,7 @@ class PptxConverter(DocumentConverter):
             first_row = False
         html_table += "</table></body></html>"

-        return (
-            self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
-            + "\n"
-        )
+        return self._html_converter.convert_string(html_table).markdown.strip() + "\n"

     def _convert_chart_to_markdown(self, chart):
         try:
@@ -28,10 +28,6 @@ CANDIDATE_FILE_EXTENSIONS = [
 class RssConverter(DocumentConverter):
     """Convert RSS / Atom type to markdown"""

-    def __init__(self):
-        super().__init__()
-        self._kwargs = {}
-
     def accepts(
         self,
         file_stream: BinaryIO,
@@ -70,7 +66,7 @@ class RssConverter(DocumentConverter):
             file_stream.seek(cur_pos)
         return False

-    def _feed_type(self, doc: Any) -> str | None:
+    def _feed_type(self, doc: Any) -> str:
         if doc.getElementsByTagName("rss"):
             return "rss"
         elif doc.getElementsByTagName("feed"):
@@ -86,7 +82,6 @@ class RssConverter(DocumentConverter):
         stream_info: StreamInfo,
         **kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
-        self._kwargs = kwargs
         doc = minidom.parse(file_stream)
         feed_type = self._feed_type(doc)
@@ -135,10 +130,10 @@ class RssConverter(DocumentConverter):
         Returns None if the feed type is not recognized or something goes wrong.
         """
         root = doc.getElementsByTagName("rss")[0]
-        channel_list = root.getElementsByTagName("channel")
-        if not channel_list:
-            raise ValueError("No channel found in RSS feed")
-        channel = channel_list[0]
+        channel = root.getElementsByTagName("channel")
+        if not channel:
+            return None
+        channel = channel[0]
         channel_title = self._get_data_by_tag_name(channel, "title")
         channel_description = self._get_data_by_tag_name(channel, "description")
         items = channel.getElementsByTagName("item")
@@ -146,6 +141,8 @@ class RssConverter(DocumentConverter):
         md_text = f"# {channel_title}\n"
         if channel_description:
             md_text += f"{channel_description}\n"
+        if not items:
+            items = []
         for item in items:
             title = self._get_data_by_tag_name(item, "title")
             description = self._get_data_by_tag_name(item, "description")
@@ -171,7 +168,7 @@ class RssConverter(DocumentConverter):
         try:
             # using bs4 because many RSS feeds have HTML-styled content
             soup = BeautifulSoup(content, "html.parser")
-            return _CustomMarkdownify(**self._kwargs).convert_soup(soup)
+            return _CustomMarkdownify().convert_soup(soup)
         except BaseException as _:
             return content
@@ -186,6 +183,5 @@ class RssConverter(DocumentConverter):
             return None
         fc = nodes[0].firstChild
         if fc:
-            if hasattr(fc, "data"):
-                return fc.data
+            return fc.data
         return None
@@ -7,14 +7,8 @@ from .._exceptions import MissingDependencyException
 # Save reporting of any exceptions for later
 _dependency_exc_info = None
 try:
-    # Suppress some warnings on library import
-    import warnings
-
-    with warnings.catch_warnings():
-        warnings.filterwarnings("ignore", category=DeprecationWarning)
-        warnings.filterwarnings("ignore", category=SyntaxWarning)
-        import speech_recognition as sr
-        import pydub
+    import speech_recognition as sr
+    import pydub
 except ImportError:
     # Preserve the error and stack trace for later
     _dependency_exc_info = sys.exc_info()
@@ -1,7 +1,7 @@
 import io
 import re
-import bs4
 from typing import Any, BinaryIO, Optional
+from bs4 import BeautifulSoup

 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
@@ -57,7 +57,7 @@ class WikipediaConverter(DocumentConverter):
     ) -> DocumentConverterResult:
         # Parse the stream
         encoding = "utf-8" if stream_info.charset is None else stream_info.charset
-        soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
+        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

         # Remove javascript and style blocks
         for script in soup(["script", "style"]):
@@ -72,15 +72,16 @@ class WikipediaConverter(DocumentConverter):

         if body_elm:
             # What's the title
-            if title_elm and isinstance(title_elm, bs4.Tag):
-                main_title = title_elm.string
+            if title_elm and len(title_elm) > 0:
+                main_title = title_elm.string  # type: ignore
+                assert isinstance(main_title, str)

             # Convert the page
-            webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(
-                **kwargs
-            ).convert_soup(body_elm)
+            webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(
+                body_elm
+            )
         else:
-            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
+            webpage_text = _CustomMarkdownify().convert_soup(soup)

         return DocumentConverterResult(
             markdown=webpage_text,
@@ -86,9 +86,7 @@ class XlsxConverter(DocumentConverter):
             md_content += f"## {s}\n"
             html_content = sheets[s].to_html(index=False)
             md_content += (
-                self._html_converter.convert_string(
-                    html_content, **kwargs
-                ).markdown.strip()
+                self._html_converter.convert_string(html_content).markdown.strip()
                 + "\n\n"
             )
@@ -148,9 +146,7 @@ class XlsConverter(DocumentConverter):
             md_content += f"## {s}\n"
             html_content = sheets[s].to_html(index=False)
             md_content += (
-                self._html_converter.convert_string(
-                    html_content, **kwargs
-                ).markdown.strip()
+                self._html_converter.convert_string(html_content).markdown.strip()
                 + "\n\n"
             )
@@ -3,22 +3,17 @@ import json
 import time
 import io
 import re
-import bs4
 from typing import Any, BinaryIO, Optional, Dict, List, Union
 from urllib.parse import parse_qs, urlparse, unquote
+from bs4 import BeautifulSoup

 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
+from ._markdownify import _CustomMarkdownify

 # Optional YouTube transcription support
 try:
-    # Suppress some warnings on library import
-    import warnings
-
-    with warnings.catch_warnings():
-        warnings.filterwarnings("ignore", category=SyntaxWarning)
-        # Patch submitted upstream to fix the SyntaxWarning
-        from youtube_transcript_api import YouTubeTranscriptApi
-
+    from youtube_transcript_api import YouTubeTranscriptApi
     IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
 except ModuleNotFoundError:
@@ -77,31 +72,21 @@ class YouTubeConverter(DocumentConverter):
     ) -> DocumentConverterResult:
         # Parse the stream
         encoding = "utf-8" if stream_info.charset is None else stream_info.charset
-        soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
+        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

         # Read the meta tags
-        metadata: Dict[str, str] = {}
-
-        if soup.title and soup.title.string:
-            metadata["title"] = soup.title.string
-
+        metadata: Dict[str, str] = {"title": soup.title.string}
         for meta in soup(["meta"]):
-            if not isinstance(meta, bs4.Tag):
-                continue
-
             for a in meta.attrs:
                 if a in ["itemprop", "property", "name"]:
-                    key = str(meta.get(a, ""))
-                    content = str(meta.get("content", ""))
-                    if key and content:  # Only add non-empty content
-                        metadata[key] = content
+                    content = meta.get("content", "")
+                    if content:  # Only add non-empty content
+                        metadata[meta[a]] = content
                     break

         # Try reading the description
         try:
             for script in soup(["script"]):
-                if not isinstance(script, bs4.Tag):
-                    continue
                 if not script.string:  # Skip empty scripts
                     continue
                 content = script.string
@@ -147,7 +132,6 @@ class YouTubeConverter(DocumentConverter):
             webpage_text += f"\n### Description\n{description}\n"

         if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
-            ytt_api = YouTubeTranscriptApi()
             transcript_text = ""
             parsed_url = urlparse(stream_info.url)  # type: ignore
             params = parse_qs(parsed_url.query)  # type: ignore
@@ -159,7 +143,7 @@ class YouTubeConverter(DocumentConverter):
                 )
                 # Retry the transcript fetching operation
                 transcript = self._retry_operation(
-                    lambda: ytt_api.fetch(
+                    lambda: YouTubeTranscriptApi.get_transcript(
                         video_id, languages=youtube_transcript_languages
                     ),
                     retries=3,  # Retry 3 times
@@ -167,14 +151,17 @@ class YouTubeConverter(DocumentConverter):
                 )
                 if transcript:
                     transcript_text = " ".join(
-                        [part.text for part in transcript]
+                        [part["text"] for part in transcript]
                     )  # type: ignore
+                    # Alternative formatting:
+                    # formatter = TextFormatter()
+                    # formatter.format_transcript(transcript)
             except Exception as e:
                 print(f"Error fetching transcript: {e}")
             if transcript_text:
                 webpage_text += f"\n### Transcript\n{transcript_text}\n"

-        title = title if title else (soup.title.string if soup.title else "")
+        title = title if title else soup.title.string
         assert isinstance(title, str)

         return DocumentConverterResult(
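These hunks swap youtube-transcript-api's newer instance API (the `-` side: `fetch()`, returning snippets with a `.text` attribute) for the older static `get_transcript()` (the `+` side: a list of dicts). A sketch of the newer style; the video id is illustrative and network access is required:

```python
from youtube_transcript_api import YouTubeTranscriptApi

# Instance-based 1.x API, as used on the "-" side of this comparison.
ytt_api = YouTubeTranscriptApi()
transcript = ytt_api.fetch("dQw4w9WgXcQ", languages=["en"])
text = " ".join(part.text for part in transcript)
print(text[:200])
```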
@@ -25,11 +25,8 @@ GENERAL_TEST_VECTORS = [
             "# Abstract",
             "# Introduction",
             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
-            "data:image/png;base64...",
-        ],
-        must_not_include=[
-            "data:image/png;base64,iVBORw0KGgoAAAANSU",
         ],
+        must_not_include=[],
     ),
     FileTestVector(
         filename="test.xlsx",
@@ -68,9 +65,8 @@ GENERAL_TEST_VECTORS = [
             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
             "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
             "2003",  # chart value
-            "![This phrase of the caption is Human-written.]",  # image caption
         ],
-        must_not_include=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"],
+        must_not_include=[],
     ),
     FileTestVector(
         filename="test_outlook_msg.msg",
@@ -215,64 +211,4 @@ GENERAL_TEST_VECTORS = [
         ],
         must_not_include=[],
     ),
-    FileTestVector(
-        filename="test.epub",
-        mimetype="application/epub+zip",
-        charset=None,
-        url=None,
-        must_include=[
-            "**Authors:** Test Author",
-            "A test EPUB document for MarkItDown testing",
-            "# Chapter 1: Test Content",
-            "This is a **test** paragraph with some formatting",
-            "* A bullet point",
-            "* Another point",
-            "# Chapter 2: More Content",
-            "*different* style",
-            "> This is a blockquote for testing",
-        ],
-        must_not_include=[],
-    ),
-]
-
-
-DATA_URI_TEST_VECTORS = [
-    FileTestVector(
-        filename="test.docx",
-        mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-        charset=None,
-        url=None,
-        must_include=[
-            "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
-            "49e168b7-d2ae-407f-a055-2167576f39a1",
-            "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
-            "# Abstract",
-            "# Introduction",
-            "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
-            "data:image/png;base64,iVBORw0KGgoAAAANSU",
-        ],
-        must_not_include=[
-            "data:image/png;base64...",
-        ],
-    ),
-    FileTestVector(
-        filename="test.pptx",
-        mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
-        charset=None,
-        url=None,
-        must_include=[
-            "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
-            "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
-            "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
-            "1b92870d-e3b5-4e65-8153-919f4ff45592",
-            "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
-            "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
-            "2003",  # chart value
-            "![This phrase of the caption is Human-written.]",  # image caption
-            "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE",
-        ],
-        must_not_include=[
-            "![This phrase of the caption is Human-written.](Picture4.jpg)",
-        ],
-    ),
-]
 ]
@@ -7,17 +7,9 @@ import locale
 from typing import List

 if __name__ == "__main__":
-    from _test_vectors import (
-        GENERAL_TEST_VECTORS,
-        DATA_URI_TEST_VECTORS,
-        FileTestVector,
-    )
+    from _test_vectors import GENERAL_TEST_VECTORS, FileTestVector
 else:
-    from ._test_vectors import (
-        GENERAL_TEST_VECTORS,
-        DATA_URI_TEST_VECTORS,
-        FileTestVector,
-    )
+    from ._test_vectors import GENERAL_TEST_VECTORS, FileTestVector

 from markitdown import (
     MarkItDown,
@@ -122,9 +114,7 @@ def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None:
     )

     stdout = result.stdout.decode(locale.getpreferredencoding())
-    assert (
-        result.returncode == 0
-    ), f"CLI exited with error: {result.stderr.decode('utf-8')}"
+    assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
     for test_string in test_vector.must_include:
         assert test_string in stdout
     for test_string in test_vector.must_not_include:
@@ -157,39 +147,6 @@ def test_convert_url(shared_tmp_dir, test_vector):
         assert test_string not in stdout


-@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
-def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None:
-    """Test CLI functionality when keep_data_uris is enabled"""
-
-    output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
-    result = subprocess.run(
-        [
-            "python",
-            "-m",
-            "markitdown",
-            "--keep-data-uris",
-            "-o",
-            output_file,
-            os.path.join(TEST_FILES_DIR, test_vector.filename),
-        ],
-        capture_output=True,
-        text=True,
-    )
-
-    assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
-    assert os.path.exists(output_file), f"Output file not created: {output_file}"
-
-    with open(output_file, "r") as f:
-        output_data = f.read()
-        for test_string in test_vector.must_include:
-            assert test_string in output_data
-        for test_string in test_vector.must_not_include:
-            assert test_string not in output_data
-
-    os.remove(output_file)
-    assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
-
-
 if __name__ == "__main__":
     import sys
     import tempfile
@@ -197,7 +154,6 @@ if __name__ == "__main__":
     """Runs this file's tests from the command line."""

     with tempfile.TemporaryDirectory() as tmp_dir:
-        # General tests
         for test_function in [
             test_output_to_stdout,
             test_output_to_file,
@@ -211,17 +167,4 @@ if __name__ == "__main__":
                 )
                 test_function(tmp_dir, test_vector)
                 print("OK")

-        # Data URI tests
-        for test_function in [
-            test_output_to_file_with_data_uris,
-        ]:
-            for test_vector in DATA_URI_TEST_VECTORS:
-                print(
-                    f"Running {test_function.__name__} on {test_vector.filename}...",
-                    end="",
-                )
-                test_function(tmp_dir, test_vector)
-                print("OK")
-
         print("All tests passed!")
BIN packages/markitdown/tests/test_files/test.docx (vendored; Executable file → Normal file). Binary file not shown.
BIN packages/markitdown/tests/test_files/test.epub (vendored). Binary file not shown.
@@ -5,8 +5,6 @@ import shutil
 import openai
 import pytest

-from markitdown._uri_utils import parse_data_uri, file_uri_to_path
-
 from markitdown import (
     MarkItDown,
     UnsupportedFormatException,
@@ -178,79 +176,6 @@ def test_stream_info_operations() -> None:
|
|||||||
assert updated_stream_info.url == "url.1"
|
assert updated_stream_info.url == "url.1"
|
||||||
|
|
||||||
|
|
||||||
-def test_data_uris() -> None:
-    # Test basic parsing of data URIs
-    data_uri = "data:text/plain;base64,SGVsbG8sIFdvcmxkIQ=="
-    mime_type, attributes, data = parse_data_uri(data_uri)
-    assert mime_type == "text/plain"
-    assert len(attributes) == 0
-    assert data == b"Hello, World!"
-
-    data_uri = "data:base64,SGVsbG8sIFdvcmxkIQ=="
-    mime_type, attributes, data = parse_data_uri(data_uri)
-    assert mime_type is None
-    assert len(attributes) == 0
-    assert data == b"Hello, World!"
-
-    data_uri = "data:text/plain;charset=utf-8;base64,SGVsbG8sIFdvcmxkIQ=="
-    mime_type, attributes, data = parse_data_uri(data_uri)
-    assert mime_type == "text/plain"
-    assert len(attributes) == 1
-    assert attributes["charset"] == "utf-8"
-    assert data == b"Hello, World!"
-
-    data_uri = "data:,Hello%2C%20World%21"
-    mime_type, attributes, data = parse_data_uri(data_uri)
-    assert mime_type is None
-    assert len(attributes) == 0
-    assert data == b"Hello, World!"
-
-    data_uri = "data:text/plain,Hello%2C%20World%21"
-    mime_type, attributes, data = parse_data_uri(data_uri)
-    assert mime_type == "text/plain"
-    assert len(attributes) == 0
-    assert data == b"Hello, World!"
-
-    data_uri = "data:text/plain;charset=utf-8,Hello%2C%20World%21"
-    mime_type, attributes, data = parse_data_uri(data_uri)
-    assert mime_type == "text/plain"
-    assert len(attributes) == 1
-    assert attributes["charset"] == "utf-8"
-    assert data == b"Hello, World!"
-
-
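The removed `test_data_uris` assertions pin down the expected `parse_data_uri` contract: an optional mimetype, a dict of header attributes (e.g. `charset`), and the decoded payload as bytes. A minimal sketch that satisfies those assertions, assuming nothing about markitdown's actual `_uri_utils` implementation:

```python
# A sketch consistent with the removed assertions above; markitdown's
# real _uri_utils.parse_data_uri may differ in details.
import base64
from urllib.parse import unquote_to_bytes

def parse_data_uri(uri: str):
    # Split "data:<header>,<payload>" into its two halves.
    header, _, payload = uri[len("data:"):].partition(",")
    fields = header.split(";") if header else []

    # A trailing "base64" field marks a base64-encoded payload.
    is_base64 = bool(fields) and fields[-1] == "base64"
    if is_base64:
        fields = fields[:-1]

    mime_type = None
    attributes = {}
    for i, field in enumerate(fields):
        if "=" in field:
            key, _, value = field.partition("=")
            attributes[key] = value
        elif i == 0:
            mime_type = field  # First bare field is the mimetype, if present.

    data = base64.b64decode(payload) if is_base64 else unquote_to_bytes(payload)
    return mime_type, attributes, data
```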
-def test_file_uris() -> None:
-    # Test file URI with an empty host
-    file_uri = "file:///path/to/file.txt"
-    netloc, path = file_uri_to_path(file_uri)
-    assert netloc is None
-    assert path == "/path/to/file.txt"
-
-    # Test file URI with no host
-    file_uri = "file:/path/to/file.txt"
-    netloc, path = file_uri_to_path(file_uri)
-    assert netloc is None
-    assert path == "/path/to/file.txt"
-
-    # Test file URI with localhost
-    file_uri = "file://localhost/path/to/file.txt"
-    netloc, path = file_uri_to_path(file_uri)
-    assert netloc == "localhost"
-    assert path == "/path/to/file.txt"
-
-    # Test file URI with query parameters
-    file_uri = "file:///path/to/file.txt?param=value"
-    netloc, path = file_uri_to_path(file_uri)
-    assert netloc is None
-    assert path == "/path/to/file.txt"
-
-    # Test file URI with fragment
-    file_uri = "file:///path/to/file.txt#fragment"
-    netloc, path = file_uri_to_path(file_uri)
-    assert netloc is None
-    assert path == "/path/to/file.txt"
-
-
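Likewise, the removed `test_file_uris` assertions amount to standard URI splitting: an empty or absent authority maps to `None`, and query and fragment parts never leak into the path. A sketch consistent with those assertions, again not necessarily markitdown's real `_uri_utils` code:

```python
# A sketch of file:// handling matching the removed assertions above;
# the real implementation may handle extra cases (e.g., Windows drive letters).
from urllib.parse import unquote, urlparse

def file_uri_to_path(file_uri: str):
    parsed = urlparse(file_uri)  # ?query and #fragment land in separate fields
    netloc = parsed.netloc or None  # "" (empty or missing host) becomes None
    return netloc, unquote(parsed.path)
```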
 def test_docx_comments() -> None:
     markitdown = MarkItDown()

@@ -389,8 +314,6 @@ if __name__ == "__main__":
     """Runs this file's tests from the command line."""
     for test in [
         test_stream_info_operations,
-        test_data_uris,
-        test_file_uris,
         test_docx_comments,
         test_input_as_strings,
         test_markitdown_remote,
@@ -3,14 +3,12 @@ import os
 import time
 import pytest
 import codecs
-import base64

-from pathlib import Path

 if __name__ == "__main__":
-    from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
+    from _test_vectors import GENERAL_TEST_VECTORS
 else:
-    from ._test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
+    from ._test_vectors import GENERAL_TEST_VECTORS

 from markitdown import (
     MarkItDown,

@@ -49,6 +47,7 @@ def test_guess_stream_info(test_vector):
     # mimetype or extension, so we'll special-case them here.
     if test_vector.filename in [
         "test_outlook_msg.msg",
+        "test_mskanji.csv",  # See: https://github.com/google/magika/issues/983
     ]:
         return

@@ -97,6 +96,15 @@ def test_convert_stream_without_hints(test_vector):
     """Test the conversion of a stream with no stream info."""
     markitdown = MarkItDown()

+    # For some limited exceptions, we can't guarantee the exact
+    # mimetype or extension, so we'll special-case them here.
+    if test_vector.filename in [
+        # This appears to be a subtle bug in magika.
+        # See: https://github.com/google/magika/issues/983
+        "test_mskanji.csv",
+    ]:
+        return
+
     with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
         result = markitdown.convert(stream, url=test_vector.url)
         for string in test_vector.must_include:
@@ -110,8 +118,8 @@ def test_convert_stream_without_hints(test_vector):
     reason="do not run tests that query external urls",
 )
 @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
-def test_convert_http_uri(test_vector):
-    """Test the conversion of an HTTP:// or HTTPS:// URI."""
+def test_convert_url(test_vector):
+    """Test the conversion of a stream with no stream info."""
     markitdown = MarkItDown()

     time.sleep(1)  # Ensure we don't hit rate limits
@@ -126,96 +134,16 @@ def test_convert_http_uri(test_vector):
     assert string not in result.markdown


-@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
-def test_convert_file_uri(test_vector):
-    """Test the conversion of a file:// URI."""
-    markitdown = MarkItDown()
-
-    result = markitdown.convert(
-        Path(os.path.join(TEST_FILES_DIR, test_vector.filename)).as_uri(),
-        url=test_vector.url,
-    )
-    for string in test_vector.must_include:
-        assert string in result.markdown
-    for string in test_vector.must_not_include:
-        assert string not in result.markdown
-
-
-@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
-def test_convert_data_uri(test_vector):
-    """Test the conversion of a data URI."""
-    markitdown = MarkItDown()
-
-    data = ""
-    with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
-        data = base64.b64encode(stream.read()).decode("utf-8")
-    mimetype = test_vector.mimetype
-    data_uri = f"data:{mimetype};base64,{data}"
-
-    result = markitdown.convert(
-        data_uri,
-        url=test_vector.url,
-    )
-    for string in test_vector.must_include:
-        assert string in result.markdown
-    for string in test_vector.must_not_include:
-        assert string not in result.markdown
-
-
-@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
-def test_convert_keep_data_uris(test_vector):
-    """Test API functionality when keep_data_uris is enabled"""
-    markitdown = MarkItDown()
-
-    # Test local file conversion
-    result = markitdown.convert(
-        os.path.join(TEST_FILES_DIR, test_vector.filename),
-        keep_data_uris=True,
-        url=test_vector.url,
-    )
-
-    for string in test_vector.must_include:
-        assert string in result.markdown
-    for string in test_vector.must_not_include:
-        assert string not in result.markdown
-
-
-@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
-def test_convert_stream_keep_data_uris(test_vector):
-    """Test the conversion of a stream with no stream info."""
-    markitdown = MarkItDown()
-
-    stream_info = StreamInfo(
-        extension=os.path.splitext(test_vector.filename)[1],
-        mimetype=test_vector.mimetype,
-        charset=test_vector.charset,
-    )
-
-    with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
-        result = markitdown.convert(
-            stream, stream_info=stream_info, keep_data_uris=True, url=test_vector.url
-        )
-
-    for string in test_vector.must_include:
-        assert string in result.markdown
-    for string in test_vector.must_not_include:
-        assert string not in result.markdown
-
-
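Taken together, the tests removed above covered two API behaviors that this commit drops: passing a `data:` URI directly to `convert()`, and preserving embedded `data:` URIs via `keep_data_uris=True`. A condensed, hypothetical sketch of both call paths, assuming a markitdown version that still supports them (`example.html` is a placeholder input):

```python
# Hypothetical usage mirroring the removed API tests; assumes a markitdown
# version where convert() still accepts data: URIs and keep_data_uris.
import base64
from markitdown import MarkItDown

md = MarkItDown()

# Convert directly from a data: URI built out of a local file...
with open("example.html", "rb") as f:
    payload = base64.b64encode(f.read()).decode("utf-8")
result = md.convert(f"data:text/html;base64,{payload}")

# ...or convert the file itself, keeping embedded data: URIs verbatim.
result = md.convert("example.html", keep_data_uris=True)
print(result.markdown[:200])
```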
 if __name__ == "__main__":
     import sys

     """Runs this file's tests from the command line."""

-    # General tests
     for test_function in [
         test_guess_stream_info,
         test_convert_local,
         test_convert_stream_with_hints,
         test_convert_stream_without_hints,
-        test_convert_http_uri,
-        test_convert_file_uri,
-        test_convert_data_uri,
+        test_convert_url,
     ]:
         for test_vector in GENERAL_TEST_VECTORS:
             print(

@@ -223,17 +151,4 @@ if __name__ == "__main__":
             )
             test_function(test_vector)
             print("OK")
-
-    # Data URI tests
-    for test_function in [
-        test_convert_keep_data_uris,
-        test_convert_stream_keep_data_uris,
-    ]:
-        for test_vector in DATA_URI_TEST_VECTORS:
-            print(
-                f"Running {test_function.__name__} on {test_vector.filename}...", end=""
-            )
-            test_function(test_vector)
-            print("OK")
-
     print("All tests passed!")