Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
de2c56ffbc |
@@ -14,7 +14,7 @@ MarkItDown is a lightweight Python utility for converting various files to Markd
|
||||
At present, MarkItDown supports:
|
||||
|
||||
- PDF
|
||||
- PowerPoint
|
||||
- PowerPoint (reading in top-to-bottom, left-to-right order)
|
||||
- Word
|
||||
- Excel
|
||||
- Images (EXIF metadata and OCR)
|
||||
@@ -23,7 +23,6 @@ At present, MarkItDown supports:
|
||||
- Text-based formats (CSV, JSON, XML)
|
||||
- ZIP files (iterates over contents)
|
||||
- Youtube URLs
|
||||
- EPubs
|
||||
- ... and more!
|
||||
|
||||
## Why Markdown?
|
||||
|
||||
@@ -27,7 +27,7 @@ dependencies = [
|
||||
"beautifulsoup4",
|
||||
"requests",
|
||||
"markdownify",
|
||||
"magika~=0.6.1",
|
||||
"magika>=0.6.0rc1",
|
||||
"charset-normalizer",
|
||||
]
|
||||
|
||||
@@ -42,7 +42,7 @@ all = [
|
||||
"olefile",
|
||||
"pydub",
|
||||
"SpeechRecognition",
|
||||
"youtube-transcript-api~=1.0.0",
|
||||
"youtube-transcript-api",
|
||||
"azure-ai-documentintelligence",
|
||||
"azure-identity"
|
||||
]
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
__version__ = "0.1.0a5"
|
||||
__version__ = "0.1.0a2"
|
||||
|
||||
@@ -139,7 +139,7 @@ def main():
|
||||
else:
|
||||
charset_hint = None
|
||||
|
||||
stream_info = None
|
||||
stream_info: str | None = None
|
||||
if (
|
||||
extension_hint is not None
|
||||
or mime_type_hint is not None
|
||||
|
||||
@@ -38,7 +38,6 @@ from .converters import (
|
||||
AudioConverter,
|
||||
OutlookMsgConverter,
|
||||
ZipConverter,
|
||||
EpubConverter,
|
||||
DocumentIntelligenceConverter,
|
||||
)
|
||||
|
||||
@@ -192,7 +191,6 @@ class MarkItDown:
|
||||
self.register_converter(IpynbConverter())
|
||||
self.register_converter(PdfConverter())
|
||||
self.register_converter(OutlookMsgConverter())
|
||||
self.register_converter(EpubConverter())
|
||||
|
||||
# Register Document Intelligence converter at the top of the stack if endpoint is provided
|
||||
docintel_endpoint = kwargs.get("docintel_endpoint")
|
||||
@@ -612,16 +610,14 @@ class MarkItDown:
|
||||
# Call magika to guess from the stream
|
||||
cur_pos = file_stream.tell()
|
||||
try:
|
||||
result = self._magika.identify_stream(file_stream)
|
||||
stream_bytes = file_stream.read()
|
||||
|
||||
result = self._magika.identify_bytes(stream_bytes)
|
||||
if result.status == "ok" and result.prediction.output.label != "unknown":
|
||||
# If it's text, also guess the charset
|
||||
charset = None
|
||||
if result.prediction.output.is_text:
|
||||
# Read the first 4k to guess the charset
|
||||
file_stream.seek(cur_pos)
|
||||
stream_page = file_stream.read(4096)
|
||||
charset_result = charset_normalizer.from_bytes(stream_page).best()
|
||||
|
||||
charset_result = charset_normalizer.from_bytes(stream_bytes).best()
|
||||
if charset_result is not None:
|
||||
charset = self._normalize_charset(charset_result.encoding)
|
||||
|
||||
|
||||
@@ -18,7 +18,6 @@ from ._audio_converter import AudioConverter
|
||||
from ._outlook_msg_converter import OutlookMsgConverter
|
||||
from ._zip_converter import ZipConverter
|
||||
from ._doc_intel_converter import DocumentIntelligenceConverter
|
||||
from ._epub_converter import EpubConverter
|
||||
|
||||
__all__ = [
|
||||
"PlainTextConverter",
|
||||
@@ -38,5 +37,4 @@ __all__ = [
|
||||
"OutlookMsgConverter",
|
||||
"ZipConverter",
|
||||
"DocumentIntelligenceConverter",
|
||||
"EpubConverter",
|
||||
]
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import io
|
||||
import re
|
||||
import base64
|
||||
import binascii
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
from typing import Any, BinaryIO, Optional
|
||||
from bs4 import BeautifulSoup
|
||||
@@ -61,8 +60,6 @@ class BingSerpConverter(DocumentConverter):
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> DocumentConverterResult:
|
||||
assert stream_info.url is not None
|
||||
|
||||
# Parse the query parameters
|
||||
parsed_params = parse_qs(urlparse(stream_info.url).query)
|
||||
query = parsed_params.get("q", [""])[0]
|
||||
@@ -82,9 +79,6 @@ class BingSerpConverter(DocumentConverter):
|
||||
_markdownify = _CustomMarkdownify()
|
||||
results = list()
|
||||
for result in soup.find_all(class_="b_algo"):
|
||||
if not hasattr(result, "find_all"):
|
||||
continue
|
||||
|
||||
# Rewrite redirect urls
|
||||
for a in result.find_all("a", href=True):
|
||||
parsed_href = urlparse(a["href"])
|
||||
|
||||
@@ -1,147 +0,0 @@
|
||||
import os
|
||||
import zipfile
|
||||
import xml.dom.minidom as minidom
|
||||
|
||||
from typing import BinaryIO, Any, Dict, List
|
||||
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
|
||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||
"application/epub",
|
||||
"application/epub+zip",
|
||||
"application/x-epub+zip",
|
||||
]
|
||||
|
||||
ACCEPTED_FILE_EXTENSIONS = [".epub"]
|
||||
|
||||
MIME_TYPE_MAPPING = {
|
||||
".html": "text/html",
|
||||
".xhtml": "application/xhtml+xml",
|
||||
}
|
||||
|
||||
|
||||
class EpubConverter(HtmlConverter):
|
||||
"""
|
||||
Converts EPUB files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._html_converter = HtmlConverter()
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> bool:
|
||||
mimetype = (stream_info.mimetype or "").lower()
|
||||
extension = (stream_info.extension or "").lower()
|
||||
|
||||
if extension in ACCEPTED_FILE_EXTENSIONS:
|
||||
return True
|
||||
|
||||
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
||||
if mimetype.startswith(prefix):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def convert(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> DocumentConverterResult:
|
||||
with zipfile.ZipFile(file_stream, "r") as z:
|
||||
# Extracts metadata (title, authors, language, publisher, date, description, cover) from an EPUB file."""
|
||||
|
||||
# Locate content.opf
|
||||
container_dom = minidom.parse(z.open("META-INF/container.xml"))
|
||||
opf_path = container_dom.getElementsByTagName("rootfile")[0].getAttribute(
|
||||
"full-path"
|
||||
)
|
||||
|
||||
# Parse content.opf
|
||||
opf_dom = minidom.parse(z.open(opf_path))
|
||||
metadata: Dict[str, Any] = {
|
||||
"title": self._get_text_from_node(opf_dom, "dc:title"),
|
||||
"authors": self._get_all_texts_from_nodes(opf_dom, "dc:creator"),
|
||||
"language": self._get_text_from_node(opf_dom, "dc:language"),
|
||||
"publisher": self._get_text_from_node(opf_dom, "dc:publisher"),
|
||||
"date": self._get_text_from_node(opf_dom, "dc:date"),
|
||||
"description": self._get_text_from_node(opf_dom, "dc:description"),
|
||||
"identifier": self._get_text_from_node(opf_dom, "dc:identifier"),
|
||||
}
|
||||
|
||||
# Extract manifest items (ID → href mapping)
|
||||
manifest = {
|
||||
item.getAttribute("id"): item.getAttribute("href")
|
||||
for item in opf_dom.getElementsByTagName("item")
|
||||
}
|
||||
|
||||
# Extract spine order (ID refs)
|
||||
spine_items = opf_dom.getElementsByTagName("itemref")
|
||||
spine_order = [item.getAttribute("idref") for item in spine_items]
|
||||
|
||||
# Convert spine order to actual file paths
|
||||
base_path = "/".join(
|
||||
opf_path.split("/")[:-1]
|
||||
) # Get base directory of content.opf
|
||||
spine = [
|
||||
f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
|
||||
for item_id in spine_order
|
||||
if item_id in manifest
|
||||
]
|
||||
|
||||
# Extract and convert the content
|
||||
markdown_content: List[str] = []
|
||||
for file in spine:
|
||||
if file in z.namelist():
|
||||
with z.open(file) as f:
|
||||
filename = os.path.basename(file)
|
||||
extension = os.path.splitext(filename)[1].lower()
|
||||
mimetype = MIME_TYPE_MAPPING.get(extension)
|
||||
converted_content = self._html_converter.convert(
|
||||
f,
|
||||
StreamInfo(
|
||||
mimetype=mimetype,
|
||||
extension=extension,
|
||||
filename=filename,
|
||||
),
|
||||
)
|
||||
markdown_content.append(converted_content.markdown.strip())
|
||||
|
||||
# Format and add the metadata
|
||||
metadata_markdown = []
|
||||
for key, value in metadata.items():
|
||||
if isinstance(value, list):
|
||||
value = ", ".join(value)
|
||||
if value:
|
||||
metadata_markdown.append(f"**{key.capitalize()}:** {value}")
|
||||
|
||||
markdown_content.insert(0, "\n".join(metadata_markdown))
|
||||
|
||||
return DocumentConverterResult(
|
||||
markdown="\n\n".join(markdown_content), title=metadata["title"]
|
||||
)
|
||||
|
||||
def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None:
|
||||
"""Convenience function to extract a single occurrence of a tag (e.g., title)."""
|
||||
texts = self._get_all_texts_from_nodes(dom, tag_name)
|
||||
if len(texts) > 0:
|
||||
return texts[0]
|
||||
else:
|
||||
return None
|
||||
|
||||
def _get_all_texts_from_nodes(
|
||||
self, dom: minidom.Document, tag_name: str
|
||||
) -> List[str]:
|
||||
"""Helper function to extract all occurrences of a tag (e.g., multiple authors)."""
|
||||
texts: List[str] = []
|
||||
for node in dom.getElementsByTagName(tag_name):
|
||||
if node.firstChild and hasattr(node.firstChild, "nodeValue"):
|
||||
texts.append(node.firstChild.nodeValue.strip())
|
||||
return texts
|
||||
@@ -9,7 +9,7 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
_dependency_exc_info = None
|
||||
olefile = None
|
||||
try:
|
||||
import olefile # type: ignore[no-redef]
|
||||
import olefile
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
@@ -56,13 +56,12 @@ class OutlookMsgConverter(DocumentConverter):
|
||||
|
||||
# Brue force, check if it's an Outlook file
|
||||
try:
|
||||
if olefile is not None:
|
||||
msg = olefile.OleFileIO(file_stream)
|
||||
toc = "\n".join([str(stream) for stream in msg.listdir()])
|
||||
return (
|
||||
"__properties_version1.0" in toc
|
||||
and "__recip_version1.0_#00000000" in toc
|
||||
)
|
||||
msg = olefile.OleFileIO(file_stream)
|
||||
toc = "\n".join([str(stream) for stream in msg.listdir()])
|
||||
return (
|
||||
"__properties_version1.0" in toc
|
||||
and "__recip_version1.0_#00000000" in toc
|
||||
)
|
||||
except Exception as e:
|
||||
pass
|
||||
finally:
|
||||
@@ -90,11 +89,7 @@ class OutlookMsgConverter(DocumentConverter):
|
||||
_dependency_exc_info[2]
|
||||
)
|
||||
|
||||
assert (
|
||||
olefile is not None
|
||||
) # If we made it this far, olefile should be available
|
||||
msg = olefile.OleFileIO(file_stream)
|
||||
|
||||
# Extract email metadata
|
||||
md_content = "# Email Message\n\n"
|
||||
|
||||
@@ -126,7 +121,6 @@ class OutlookMsgConverter(DocumentConverter):
|
||||
|
||||
def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
|
||||
"""Helper to safely extract and decode stream data from the MSG file."""
|
||||
assert olefile is not None
|
||||
assert isinstance(
|
||||
msg, olefile.OleFileIO
|
||||
) # Ensure msg is of the correct type (type hinting is not possible with the optional olefile package)
|
||||
|
||||
@@ -17,16 +17,12 @@ except ImportError:
|
||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||
"text/",
|
||||
"application/json",
|
||||
"application/markdown",
|
||||
]
|
||||
|
||||
ACCEPTED_FILE_EXTENSIONS = [
|
||||
".txt",
|
||||
".text",
|
||||
".md",
|
||||
".markdown",
|
||||
".json",
|
||||
".jsonl",
|
||||
# Mimetypes to ignore (commonly confused extensions)
|
||||
IGNORE_MIME_TYPE_PREFIXES = [
|
||||
"text/vnd.in3d.spot", # .spo wich is confused with xls, doc, etc.
|
||||
"text/vnd.graphviz", # .dot which is confused with xls, doc, etc.
|
||||
]
|
||||
|
||||
|
||||
@@ -42,14 +38,9 @@ class PlainTextConverter(DocumentConverter):
|
||||
mimetype = (stream_info.mimetype or "").lower()
|
||||
extension = (stream_info.extension or "").lower()
|
||||
|
||||
# If we have a charset, we can safely assume it's text
|
||||
# With Magika in the earlier stages, this handles most cases
|
||||
if stream_info.charset is not None:
|
||||
return True
|
||||
|
||||
# Otherwise, check the mimetype and extension
|
||||
if extension in ACCEPTED_FILE_EXTENSIONS:
|
||||
return True
|
||||
for prefix in IGNORE_MIME_TYPE_PREFIXES:
|
||||
if mimetype.startswith(prefix):
|
||||
return False
|
||||
|
||||
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
||||
if mimetype.startswith(prefix):
|
||||
|
||||
@@ -66,7 +66,7 @@ class RssConverter(DocumentConverter):
|
||||
file_stream.seek(cur_pos)
|
||||
return False
|
||||
|
||||
def _feed_type(self, doc: Any) -> str | None:
|
||||
def _feed_type(self, doc: Any) -> str:
|
||||
if doc.getElementsByTagName("rss"):
|
||||
return "rss"
|
||||
elif doc.getElementsByTagName("feed"):
|
||||
@@ -130,10 +130,10 @@ class RssConverter(DocumentConverter):
|
||||
Returns None if the feed type is not recognized or something goes wrong.
|
||||
"""
|
||||
root = doc.getElementsByTagName("rss")[0]
|
||||
channel_list = root.getElementsByTagName("channel")
|
||||
if not channel_list:
|
||||
raise ValueError("No channel found in RSS feed")
|
||||
channel = channel_list[0]
|
||||
channel = root.getElementsByTagName("channel")
|
||||
if not channel:
|
||||
return None
|
||||
channel = channel[0]
|
||||
channel_title = self._get_data_by_tag_name(channel, "title")
|
||||
channel_description = self._get_data_by_tag_name(channel, "description")
|
||||
items = channel.getElementsByTagName("item")
|
||||
@@ -141,6 +141,8 @@ class RssConverter(DocumentConverter):
|
||||
md_text = f"# {channel_title}\n"
|
||||
if channel_description:
|
||||
md_text += f"{channel_description}\n"
|
||||
if not items:
|
||||
items = []
|
||||
for item in items:
|
||||
title = self._get_data_by_tag_name(item, "title")
|
||||
description = self._get_data_by_tag_name(item, "description")
|
||||
@@ -181,6 +183,5 @@ class RssConverter(DocumentConverter):
|
||||
return None
|
||||
fc = nodes[0].firstChild
|
||||
if fc:
|
||||
if hasattr(fc, "data"):
|
||||
return fc.data
|
||||
return fc.data
|
||||
return None
|
||||
|
||||
@@ -7,14 +7,8 @@ from .._exceptions import MissingDependencyException
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
# Suppress some warnings on library import
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||
warnings.filterwarnings("ignore", category=SyntaxWarning)
|
||||
import speech_recognition as sr
|
||||
import pydub
|
||||
import speech_recognition as sr
|
||||
import pydub
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import io
|
||||
import re
|
||||
import bs4
|
||||
from typing import Any, BinaryIO, Optional
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
@@ -57,7 +57,7 @@ class WikipediaConverter(DocumentConverter):
|
||||
) -> DocumentConverterResult:
|
||||
# Parse the stream
|
||||
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
|
||||
soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
|
||||
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
|
||||
|
||||
# Remove javascript and style blocks
|
||||
for script in soup(["script", "style"]):
|
||||
@@ -72,8 +72,9 @@ class WikipediaConverter(DocumentConverter):
|
||||
|
||||
if body_elm:
|
||||
# What's the title
|
||||
if title_elm and isinstance(title_elm, bs4.Tag):
|
||||
main_title = title_elm.string
|
||||
if title_elm and len(title_elm) > 0:
|
||||
main_title = title_elm.string # type: ignore
|
||||
assert isinstance(main_title, str)
|
||||
|
||||
# Convert the page
|
||||
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(
|
||||
|
||||
@@ -3,22 +3,17 @@ import json
|
||||
import time
|
||||
import io
|
||||
import re
|
||||
import bs4
|
||||
from typing import Any, BinaryIO, Optional, Dict, List, Union
|
||||
from urllib.parse import parse_qs, urlparse, unquote
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
|
||||
# Optional YouTube transcription support
|
||||
try:
|
||||
# Suppress some warnings on library import
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings("ignore", category=SyntaxWarning)
|
||||
# Patch submitted upstream to fix the SyntaxWarning
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
|
||||
IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
|
||||
except ModuleNotFoundError:
|
||||
@@ -77,31 +72,21 @@ class YouTubeConverter(DocumentConverter):
|
||||
) -> DocumentConverterResult:
|
||||
# Parse the stream
|
||||
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
|
||||
soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
|
||||
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
|
||||
|
||||
# Read the meta tags
|
||||
metadata: Dict[str, str] = {}
|
||||
|
||||
if soup.title and soup.title.string:
|
||||
metadata["title"] = soup.title.string
|
||||
|
||||
metadata: Dict[str, str] = {"title": soup.title.string}
|
||||
for meta in soup(["meta"]):
|
||||
if not isinstance(meta, bs4.Tag):
|
||||
continue
|
||||
|
||||
for a in meta.attrs:
|
||||
if a in ["itemprop", "property", "name"]:
|
||||
key = str(meta.get(a, ""))
|
||||
content = str(meta.get("content", ""))
|
||||
if key and content: # Only add non-empty content
|
||||
metadata[key] = content
|
||||
content = meta.get("content", "")
|
||||
if content: # Only add non-empty content
|
||||
metadata[meta[a]] = content
|
||||
break
|
||||
|
||||
# Try reading the description
|
||||
try:
|
||||
for script in soup(["script"]):
|
||||
if not isinstance(script, bs4.Tag):
|
||||
continue
|
||||
if not script.string: # Skip empty scripts
|
||||
continue
|
||||
content = script.string
|
||||
@@ -147,7 +132,6 @@ class YouTubeConverter(DocumentConverter):
|
||||
webpage_text += f"\n### Description\n{description}\n"
|
||||
|
||||
if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
|
||||
ytt_api = YouTubeTranscriptApi()
|
||||
transcript_text = ""
|
||||
parsed_url = urlparse(stream_info.url) # type: ignore
|
||||
params = parse_qs(parsed_url.query) # type: ignore
|
||||
@@ -159,7 +143,7 @@ class YouTubeConverter(DocumentConverter):
|
||||
)
|
||||
# Retry the transcript fetching operation
|
||||
transcript = self._retry_operation(
|
||||
lambda: ytt_api.fetch(
|
||||
lambda: YouTubeTranscriptApi.get_transcript(
|
||||
video_id, languages=youtube_transcript_languages
|
||||
),
|
||||
retries=3, # Retry 3 times
|
||||
@@ -167,14 +151,17 @@ class YouTubeConverter(DocumentConverter):
|
||||
)
|
||||
if transcript:
|
||||
transcript_text = " ".join(
|
||||
[part.text for part in transcript]
|
||||
[part["text"] for part in transcript]
|
||||
) # type: ignore
|
||||
# Alternative formatting:
|
||||
# formatter = TextFormatter()
|
||||
# formatter.format_transcript(transcript)
|
||||
except Exception as e:
|
||||
print(f"Error fetching transcript: {e}")
|
||||
if transcript_text:
|
||||
webpage_text += f"\n### Transcript\n{transcript_text}\n"
|
||||
|
||||
title = title if title else (soup.title.string if soup.title else "")
|
||||
title = title if title else soup.title.string
|
||||
assert isinstance(title, str)
|
||||
|
||||
return DocumentConverterResult(
|
||||
|
||||
@@ -211,22 +211,4 @@ GENERAL_TEST_VECTORS = [
|
||||
],
|
||||
must_not_include=[],
|
||||
),
|
||||
FileTestVector(
|
||||
filename="test.epub",
|
||||
mimetype="application/epub+zip",
|
||||
charset=None,
|
||||
url=None,
|
||||
must_include=[
|
||||
"**Authors:** Test Author",
|
||||
"A test EPUB document for MarkItDown testing",
|
||||
"# Chapter 1: Test Content",
|
||||
"This is a **test** paragraph with some formatting",
|
||||
"* A bullet point",
|
||||
"* Another point",
|
||||
"# Chapter 2: More Content",
|
||||
"*different* style",
|
||||
"> This is a blockquote for testing",
|
||||
],
|
||||
must_not_include=[],
|
||||
),
|
||||
]
|
||||
|
||||
@@ -114,9 +114,7 @@ def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None:
|
||||
)
|
||||
|
||||
stdout = result.stdout.decode(locale.getpreferredencoding())
|
||||
assert (
|
||||
result.returncode == 0
|
||||
), f"CLI exited with error: {result.stderr.decode('utf-8')}"
|
||||
assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
|
||||
for test_string in test_vector.must_include:
|
||||
assert test_string in stdout
|
||||
for test_string in test_vector.must_not_include:
|
||||
|
||||
BIN
packages/markitdown/tests/test_files/test.epub
vendored
BIN
packages/markitdown/tests/test_files/test.epub
vendored
Binary file not shown.
@@ -47,6 +47,7 @@ def test_guess_stream_info(test_vector):
|
||||
# mimetype or extension, so we'll special-case them here.
|
||||
if test_vector.filename in [
|
||||
"test_outlook_msg.msg",
|
||||
"test_mskanji.csv", # See: https://github.com/google/magika/issues/983
|
||||
]:
|
||||
return
|
||||
|
||||
@@ -95,6 +96,15 @@ def test_convert_stream_without_hints(test_vector):
|
||||
"""Test the conversion of a stream with no stream info."""
|
||||
markitdown = MarkItDown()
|
||||
|
||||
# For some limited exceptions, we can't guarantee the exact
|
||||
# mimetype or extension, so we'll special-case them here.
|
||||
if test_vector.filename in [
|
||||
# This appears to be a subtle bug in magika.
|
||||
# See: https://github.com/google/magika/issues/983
|
||||
"test_mskanji.csv",
|
||||
]:
|
||||
return
|
||||
|
||||
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
|
||||
result = markitdown.convert(stream, url=test_vector.url)
|
||||
for string in test_vector.must_include:
|
||||
|
||||
Reference in New Issue
Block a user