Compare commits
18 Commits
zip_format
...
kennyzhang
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4e0a10ecf3 | ||
|
|
950b135da6 | ||
|
|
b671345bb9 | ||
|
|
d9a92f7f06 | ||
|
|
db0c8acbaf | ||
|
|
08330c2ac3 | ||
|
|
4afc1fe886 | ||
|
|
b0044720da | ||
|
|
07a28d4f00 | ||
|
|
b8b3897952 | ||
|
|
395ce2d301 | ||
|
|
808401a331 | ||
|
|
e75f3f6f5b | ||
|
|
8e950325d2 | ||
|
|
096fef3d5f | ||
|
|
52cbff061a | ||
|
|
0027e6d425 | ||
|
|
63a7bafadd |
19
README.md
19
README.md
@@ -97,6 +97,25 @@ result = md.convert("test.pdf")
|
|||||||
print(result.text_content)
|
print(result.text_content)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
MarkItDown also supports converting file objects directly:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from markitdown import MarkItDown
|
||||||
|
|
||||||
|
md = MarkItDown()
|
||||||
|
|
||||||
|
# Providing the file extension when converting via file objects is recommended for the most consistent results
|
||||||
|
# Binary Mode
|
||||||
|
with open("test.docx", 'rb') as file:
|
||||||
|
result = md.convert(file, file_extension=".docx")
|
||||||
|
print(result.text_content)
|
||||||
|
|
||||||
|
# Non-Binary Mode
|
||||||
|
with open("sample.ipynb", 'rt', encoding="utf-8") as file:
|
||||||
|
result = md.convert(file, file_extension=".ipynb")
|
||||||
|
print(result.text_content)
|
||||||
|
```
|
||||||
|
|
||||||
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
|
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ from typing import Any, List, Optional, Union
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from warnings import warn
|
from warnings import warn
|
||||||
|
from io import BufferedIOBase, TextIOBase, BytesIO
|
||||||
|
|
||||||
# File-format detection
|
# File-format detection
|
||||||
import puremagic
|
import puremagic
|
||||||
@@ -36,6 +37,7 @@ from .converters import (
|
|||||||
OutlookMsgConverter,
|
OutlookMsgConverter,
|
||||||
ZipConverter,
|
ZipConverter,
|
||||||
DocumentIntelligenceConverter,
|
DocumentIntelligenceConverter,
|
||||||
|
ConverterInput,
|
||||||
)
|
)
|
||||||
|
|
||||||
from ._exceptions import (
|
from ._exceptions import (
|
||||||
@@ -173,14 +175,15 @@ class MarkItDown:
|
|||||||
warn("Plugins converters are already enabled.", RuntimeWarning)
|
warn("Plugins converters are already enabled.", RuntimeWarning)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
self,
|
||||||
|
source: Union[str, requests.Response, Path, BufferedIOBase, TextIOBase],
|
||||||
|
**kwargs: Any,
|
||||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
- source: can be a string representing a path either as string pathlib path object or url, or a requests.response object
|
- source: can be a string (a local path or a url), a pathlib Path object, a requests.Response object, or a file object (TextIO or BinaryIO)
|
||||||
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
|
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Local path or url
|
# Local path or url
|
||||||
if isinstance(source, str):
|
if isinstance(source, str):
|
||||||
if (
|
if (
|
||||||
@@ -196,6 +199,9 @@ class MarkItDown:
|
|||||||
return self.convert_response(source, **kwargs)
|
return self.convert_response(source, **kwargs)
|
||||||
elif isinstance(source, Path):
|
elif isinstance(source, Path):
|
||||||
return self.convert_local(source, **kwargs)
|
return self.convert_local(source, **kwargs)
|
||||||
|
# File object
|
||||||
|
elif isinstance(source, BufferedIOBase) or isinstance(source, TextIOBase):
|
||||||
|
return self.convert_file_object(source, **kwargs)
|
||||||
|
|
||||||
def convert_local(
|
def convert_local(
|
||||||
self, path: Union[str, Path], **kwargs: Any
|
self, path: Union[str, Path], **kwargs: Any
|
||||||
@@ -210,11 +216,33 @@ class MarkItDown:
|
|||||||
base, ext = os.path.splitext(path)
|
base, ext = os.path.splitext(path)
|
||||||
self._append_ext(extensions, ext)
|
self._append_ext(extensions, ext)
|
||||||
|
|
||||||
for g in self._guess_ext_magic(path):
|
for g in self._guess_ext_magic(source=path):
|
||||||
self._append_ext(extensions, g)
|
self._append_ext(extensions, g)
|
||||||
|
|
||||||
|
# Create the ConverterInput object
|
||||||
|
input = ConverterInput(input_type="filepath", filepath=path)
|
||||||
|
|
||||||
# Convert
|
# Convert
|
||||||
return self._convert(path, extensions, **kwargs)
|
return self._convert(input, extensions, **kwargs)
|
||||||
|
|
||||||
|
def convert_file_object(
|
||||||
|
self, file_object: Union[BufferedIOBase, TextIOBase], **kwargs: Any
|
||||||
|
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||||
|
# Prepare a list of extensions to try (in order of priority)
|
||||||
|
ext = kwargs.get("file_extension")
|
||||||
|
extensions = [ext] if ext is not None else []
|
||||||
|
|
||||||
|
# TODO: Currently, there are some ongoing issues with passing direct file objects to puremagic (incorrect guesses, unsupported file type errors, etc.)
|
||||||
|
# Only use puremagic as a last resort if no extensions were provided
|
||||||
|
if extensions == []:
|
||||||
|
for g in self._guess_ext_magic(source=file_object):
|
||||||
|
self._append_ext(extensions, g)
|
||||||
|
|
||||||
|
# Create the ConverterInput object
|
||||||
|
input = ConverterInput(input_type="object", file_object=file_object)
|
||||||
|
|
||||||
|
# Convert
|
||||||
|
return self._convert(input, extensions, **kwargs)
|
||||||
|
|
||||||
# TODO what should stream's type be?
|
# TODO what should stream's type be?
|
||||||
def convert_stream(
|
def convert_stream(
|
||||||
@@ -238,11 +266,14 @@ class MarkItDown:
|
|||||||
fh.close()
|
fh.close()
|
||||||
|
|
||||||
# Use puremagic to check for more extension options
|
# Use puremagic to check for more extension options
|
||||||
for g in self._guess_ext_magic(temp_path):
|
for g in self._guess_ext_magic(source=temp_path):
|
||||||
self._append_ext(extensions, g)
|
self._append_ext(extensions, g)
|
||||||
|
|
||||||
|
# Create the ConverterInput object
|
||||||
|
input = ConverterInput(input_type="filepath", filepath=temp_path)
|
||||||
|
|
||||||
# Convert
|
# Convert
|
||||||
result = self._convert(temp_path, extensions, **kwargs)
|
result = self._convert(input, extensions, **kwargs)
|
||||||
# Clean up
|
# Clean up
|
||||||
finally:
|
finally:
|
||||||
try:
|
try:
|
||||||
@@ -294,11 +325,14 @@ class MarkItDown:
|
|||||||
fh.close()
|
fh.close()
|
||||||
|
|
||||||
# Use puremagic to check for more extension options
|
# Use puremagic to check for more extension options
|
||||||
for g in self._guess_ext_magic(temp_path):
|
for g in self._guess_ext_magic(source=temp_path):
|
||||||
self._append_ext(extensions, g)
|
self._append_ext(extensions, g)
|
||||||
|
|
||||||
|
# Create the ConverterInput object
|
||||||
|
input = ConverterInput(input_type="filepath", filepath=temp_path)
|
||||||
|
|
||||||
# Convert
|
# Convert
|
||||||
result = self._convert(temp_path, extensions, url=response.url, **kwargs)
|
result = self._convert(input, extensions, url=response.url, **kwargs)
|
||||||
# Clean up
|
# Clean up
|
||||||
finally:
|
finally:
|
||||||
try:
|
try:
|
||||||
@@ -310,10 +344,9 @@ class MarkItDown:
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
def _convert(
|
def _convert(
|
||||||
self, local_path: str, extensions: List[Union[str, None]], **kwargs
|
self, input: ConverterInput, extensions: List[Union[str, None]], **kwargs
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
error_trace = ""
|
error_trace = ""
|
||||||
|
|
||||||
# Create a copy of the page_converters list, sorted by priority.
|
# Create a copy of the page_converters list, sorted by priority.
|
||||||
# We do this with each call to _convert because the priority of converters may change between calls.
|
# We do this with each call to _convert because the priority of converters may change between calls.
|
||||||
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
|
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
|
||||||
@@ -348,7 +381,7 @@ class MarkItDown:
|
|||||||
|
|
||||||
# If we hit an error log it and keep trying
|
# If we hit an error log it and keep trying
|
||||||
try:
|
try:
|
||||||
res = converter.convert(local_path, **_kwargs)
|
res = converter.convert(input, **_kwargs)
|
||||||
except Exception:
|
except Exception:
|
||||||
error_trace = ("\n\n" + traceback.format_exc()).strip()
|
error_trace = ("\n\n" + traceback.format_exc()).strip()
|
||||||
|
|
||||||
@@ -365,12 +398,12 @@ class MarkItDown:
|
|||||||
# If we got this far without success, report any exceptions
|
# If we got this far without success, report any exceptions
|
||||||
if len(error_trace) > 0:
|
if len(error_trace) > 0:
|
||||||
raise FileConversionException(
|
raise FileConversionException(
|
||||||
f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
|
f"Could not convert '{input.filepath}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Nothing can handle it!
|
# Nothing can handle it!
|
||||||
raise UnsupportedFormatException(
|
raise UnsupportedFormatException(
|
||||||
f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
|
f"Could not convert '{input.filepath}' to Markdown. The formats {extensions} are not supported."
|
||||||
)
|
)
|
||||||
|
|
||||||
def _append_ext(self, extensions, ext):
|
def _append_ext(self, extensions, ext):
|
||||||
@@ -383,29 +416,38 @@ class MarkItDown:
|
|||||||
# if ext not in extensions:
|
# if ext not in extensions:
|
||||||
extensions.append(ext)
|
extensions.append(ext)
|
||||||
|
|
||||||
def _guess_ext_magic(self, path):
|
def _guess_ext_magic(self, source):
|
||||||
"""Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
|
"""Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
|
||||||
# Use puremagic to guess
|
# Use puremagic to guess
|
||||||
try:
|
try:
|
||||||
guesses = puremagic.magic_file(path)
|
guesses = []
|
||||||
|
|
||||||
# Fix for: https://github.com/microsoft/markitdown/issues/222
|
# Guess extensions for filepaths
|
||||||
# If there are no guesses, then try again after trimming leading ASCII whitespaces.
|
if isinstance(source, str):
|
||||||
# ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
|
guesses = puremagic.magic_file(source)
|
||||||
# (space, tab, newline, carriage return, vertical tab, form feed).
|
|
||||||
if len(guesses) == 0:
|
# Fix for: https://github.com/microsoft/markitdown/issues/222
|
||||||
with open(path, "rb") as file:
|
# If there are no guesses, then try again after trimming leading ASCII whitespaces.
|
||||||
while True:
|
# ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
|
||||||
char = file.read(1)
|
# (space, tab, newline, carriage return, vertical tab, form feed).
|
||||||
if not char: # End of file
|
if len(guesses) == 0:
|
||||||
break
|
with open(source, "rb") as file:
|
||||||
if not char.isspace():
|
while True:
|
||||||
file.seek(file.tell() - 1)
|
char = file.read(1)
|
||||||
break
|
if not char: # End of file
|
||||||
try:
|
break
|
||||||
guesses = puremagic.magic_stream(file)
|
if not char.isspace():
|
||||||
except puremagic.main.PureError:
|
file.seek(file.tell() - 1)
|
||||||
pass
|
break
|
||||||
|
try:
|
||||||
|
guesses = puremagic.magic_stream(file)
|
||||||
|
except puremagic.main.PureError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Guess extensions for file objects. Note that puremagic's magic_stream function requires a BytesIO-like file source
|
||||||
|
# TODO: Figure out how to guess extensions for TextIO-like file sources (manually converting to BytesIO does not work)
|
||||||
|
elif isinstance(source, BufferedIOBase):
|
||||||
|
guesses = puremagic.magic_stream(source)
|
||||||
|
|
||||||
extensions = list()
|
extensions = list()
|
||||||
for g in guesses:
|
for g in guesses:
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ from ._mp3_converter import Mp3Converter
|
|||||||
from ._outlook_msg_converter import OutlookMsgConverter
|
from ._outlook_msg_converter import OutlookMsgConverter
|
||||||
from ._zip_converter import ZipConverter
|
from ._zip_converter import ZipConverter
|
||||||
from ._doc_intel_converter import DocumentIntelligenceConverter
|
from ._doc_intel_converter import DocumentIntelligenceConverter
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"DocumentConverter",
|
"DocumentConverter",
|
||||||
@@ -42,4 +43,5 @@ __all__ = [
|
|||||||
"OutlookMsgConverter",
|
"OutlookMsgConverter",
|
||||||
"ZipConverter",
|
"ZipConverter",
|
||||||
"DocumentIntelligenceConverter",
|
"DocumentIntelligenceConverter",
|
||||||
|
"ConverterInput",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ from bs4 import BeautifulSoup
|
|||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class BingSerpConverter(DocumentConverter):
|
class BingSerpConverter(DocumentConverter):
|
||||||
@@ -21,7 +22,9 @@ class BingSerpConverter(DocumentConverter):
|
|||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(
|
||||||
|
self, input: ConverterInput, **kwargs
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a Bing SERP
|
# Bail if not a Bing SERP
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() not in [".html", ".htm"]:
|
if extension.lower() not in [".html", ".htm"]:
|
||||||
@@ -36,8 +39,9 @@ class BingSerpConverter(DocumentConverter):
|
|||||||
|
|
||||||
# Parse the file
|
# Parse the file
|
||||||
soup = None
|
soup = None
|
||||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||||
soup = BeautifulSoup(fh.read(), "html.parser")
|
soup = BeautifulSoup(file_obj.read(), "html.parser")
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
# Clean up some formatting
|
# Clean up some formatting
|
||||||
for tptt in soup.find_all(class_="tptt"):
|
for tptt in soup.find_all(class_="tptt"):
|
||||||
|
|||||||
@@ -0,0 +1,30 @@
|
|||||||
|
from typing import Any, Union
|
||||||
|
|
||||||
|
|
||||||
|
class ConverterInput:
|
||||||
|
"""
|
||||||
|
Wrapper for inputs to converter functions.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
input_type: str = "filepath",
|
||||||
|
filepath: Union[str, None] = None,
|
||||||
|
file_object: Union[Any, None] = None,
|
||||||
|
):
|
||||||
|
if input_type not in ["filepath", "object"]:
|
||||||
|
raise ValueError(f"Invalid converter input type: {input_type}")
|
||||||
|
|
||||||
|
self.input_type = input_type
|
||||||
|
self.filepath = filepath
|
||||||
|
self.file_object = file_object
|
||||||
|
|
||||||
|
def read_file(
|
||||||
|
self,
|
||||||
|
mode: str = "rb",
|
||||||
|
encoding: Union[str, None] = None,
|
||||||
|
) -> Any:
|
||||||
|
if self.input_type == "object":
|
||||||
|
return self.file_object
|
||||||
|
|
||||||
|
return open(self.filepath, mode=mode, encoding=encoding)
|
||||||
@@ -11,6 +11,7 @@ from azure.ai.documentintelligence.models import (
|
|||||||
from azure.identity import DefaultAzureCredential
|
from azure.identity import DefaultAzureCredential
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
|
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
|
||||||
@@ -37,10 +38,9 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
|||||||
api_version=self.api_version,
|
api_version=self.api_version,
|
||||||
credential=DefaultAzureCredential(),
|
credential=DefaultAzureCredential(),
|
||||||
)
|
)
|
||||||
self._priority = priority
|
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, input: ConverterInput, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if extension is not supported by Document Intelligence
|
# Bail if extension is not supported by Document Intelligence
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
@@ -60,9 +60,10 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
|||||||
if extension.lower() not in docintel_extensions:
|
if extension.lower() not in docintel_extensions:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Get the bytestring for the local path
|
# Get the bytestring from the converter input
|
||||||
with open(local_path, "rb") as f:
|
file_obj = input.read_file(mode="rb")
|
||||||
file_bytes = f.read()
|
file_bytes = file_obj.read()
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
# Certain document analysis features are not available for office filetypes (.xlsx, .pptx, .html, .docx)
|
# Certain document analysis features are not available for office filetypes (.xlsx, .pptx, .html, .docx)
|
||||||
if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]:
|
if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]:
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ from ._base import (
|
|||||||
|
|
||||||
from ._base import DocumentConverter
|
from ._base import DocumentConverter
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class DocxConverter(HtmlConverter):
|
class DocxConverter(HtmlConverter):
|
||||||
@@ -20,18 +21,20 @@ class DocxConverter(HtmlConverter):
|
|||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(
|
||||||
|
self, input: ConverterInput, **kwargs
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a DOCX
|
# Bail if not a DOCX
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".docx":
|
if extension.lower() != ".docx":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
result = None
|
result = None
|
||||||
with open(local_path, "rb") as docx_file:
|
style_map = kwargs.get("style_map", None)
|
||||||
style_map = kwargs.get("style_map", None)
|
file_obj = input.read_file(mode="rb")
|
||||||
|
result = mammoth.convert_to_html(file_obj, style_map=style_map)
|
||||||
result = mammoth.convert_to_html(docx_file, style_map=style_map)
|
file_obj.close()
|
||||||
html_content = result.value
|
html_content = result.value
|
||||||
result = self._convert(html_content)
|
result = self._convert(html_content)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ from bs4 import BeautifulSoup
|
|||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class HtmlConverter(DocumentConverter):
|
class HtmlConverter(DocumentConverter):
|
||||||
@@ -14,7 +15,7 @@ class HtmlConverter(DocumentConverter):
|
|||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, input: ConverterInput, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not html
|
# Bail if not html
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
@@ -22,8 +23,9 @@ class HtmlConverter(DocumentConverter):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
result = None
|
result = None
|
||||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||||
result = self._convert(fh.read())
|
result = self._convert(file_obj.read())
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
from typing import Union
|
from typing import Union
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
from ._media_converter import MediaConverter
|
from ._media_converter import MediaConverter
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class ImageConverter(MediaConverter):
|
class ImageConverter(MediaConverter):
|
||||||
@@ -13,7 +14,9 @@ class ImageConverter(MediaConverter):
|
|||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(
|
||||||
|
self, input: ConverterInput, **kwargs
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not an image
|
# Bail if not an image
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() not in [".jpg", ".jpeg", ".png"]:
|
if extension.lower() not in [".jpg", ".jpeg", ".png"]:
|
||||||
@@ -21,8 +24,9 @@ class ImageConverter(MediaConverter):
|
|||||||
|
|
||||||
md_content = ""
|
md_content = ""
|
||||||
|
|
||||||
# Add metadata
|
# Add metadata if a local path is provided
|
||||||
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
|
if input.input_type == "filepath":
|
||||||
|
metadata = self._get_metadata(input.filepath, kwargs.get("exiftool_path"))
|
||||||
|
|
||||||
if metadata:
|
if metadata:
|
||||||
for f in [
|
for f in [
|
||||||
@@ -47,7 +51,7 @@ class ImageConverter(MediaConverter):
|
|||||||
md_content += (
|
md_content += (
|
||||||
"\n# Description:\n"
|
"\n# Description:\n"
|
||||||
+ self._get_llm_description(
|
+ self._get_llm_description(
|
||||||
local_path,
|
input,
|
||||||
extension,
|
extension,
|
||||||
llm_client,
|
llm_client,
|
||||||
llm_model,
|
llm_model,
|
||||||
@@ -61,17 +65,20 @@ class ImageConverter(MediaConverter):
|
|||||||
text_content=md_content,
|
text_content=md_content,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_llm_description(self, local_path, extension, client, model, prompt=None):
|
def _get_llm_description(
|
||||||
|
self, input: ConverterInput, extension, client, model, prompt=None
|
||||||
|
):
|
||||||
if prompt is None or prompt.strip() == "":
|
if prompt is None or prompt.strip() == "":
|
||||||
prompt = "Write a detailed caption for this image."
|
prompt = "Write a detailed caption for this image."
|
||||||
|
|
||||||
data_uri = ""
|
data_uri = ""
|
||||||
with open(local_path, "rb") as image_file:
|
content_type, encoding = mimetypes.guess_type("_dummy" + extension)
|
||||||
content_type, encoding = mimetypes.guess_type("_dummy" + extension)
|
if content_type is None:
|
||||||
if content_type is None:
|
content_type = "image/jpeg"
|
||||||
content_type = "image/jpeg"
|
image_file = input.read_file(mode="rb")
|
||||||
image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
|
image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
|
||||||
data_uri = f"data:{content_type};base64,{image_base64}"
|
image_file.close()
|
||||||
|
data_uri = f"data:{content_type};base64,{image_base64}"
|
||||||
|
|
||||||
messages = [
|
messages = [
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ from ._base import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
from .._exceptions import FileConversionException
|
from .._exceptions import FileConversionException
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class IpynbConverter(DocumentConverter):
|
class IpynbConverter(DocumentConverter):
|
||||||
@@ -18,7 +19,7 @@ class IpynbConverter(DocumentConverter):
|
|||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, input: ConverterInput, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not ipynb
|
# Bail if not ipynb
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
@@ -27,9 +28,10 @@ class IpynbConverter(DocumentConverter):
|
|||||||
|
|
||||||
# Parse and convert the notebook
|
# Parse and convert the notebook
|
||||||
result = None
|
result = None
|
||||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||||
notebook_content = json.load(fh)
|
notebook_content = json.load(file_obj)
|
||||||
result = self._convert(notebook_content)
|
file_obj.close()
|
||||||
|
result = self._convert(notebook_content)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,10 @@
|
|||||||
import tempfile
|
import tempfile
|
||||||
|
import os
|
||||||
from typing import Union
|
from typing import Union
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
from ._wav_converter import WavConverter
|
from ._wav_converter import WavConverter
|
||||||
from warnings import resetwarnings, catch_warnings
|
from warnings import resetwarnings, catch_warnings
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
# Optional Transcription support
|
# Optional Transcription support
|
||||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
||||||
@@ -33,12 +35,19 @@ class Mp3Converter(WavConverter):
|
|||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(
|
||||||
|
self, input: ConverterInput, **kwargs
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a MP3
|
# Bail if not a MP3
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".mp3":
|
if extension.lower() != ".mp3":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Bail if a local path was not provided
|
||||||
|
if input.input_type != "filepath":
|
||||||
|
return None
|
||||||
|
local_path = input.filepath
|
||||||
|
|
||||||
md_content = ""
|
md_content = ""
|
||||||
|
|
||||||
# Add metadata
|
# Add metadata
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
import olefile
|
import olefile
|
||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class OutlookMsgConverter(DocumentConverter):
|
class OutlookMsgConverter(DocumentConverter):
|
||||||
@@ -17,7 +18,7 @@ class OutlookMsgConverter(DocumentConverter):
|
|||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, input: ConverterInput, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a MSG file
|
# Bail if not a MSG file
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
@@ -25,7 +26,9 @@ class OutlookMsgConverter(DocumentConverter):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
msg = olefile.OleFileIO(local_path)
|
file_obj = input.read_file(mode="rb")
|
||||||
|
msg = olefile.OleFileIO(file_obj)
|
||||||
|
|
||||||
# Extract email metadata
|
# Extract email metadata
|
||||||
md_content = "# Email Message\n\n"
|
md_content = "# Email Message\n\n"
|
||||||
|
|
||||||
@@ -49,6 +52,7 @@ class OutlookMsgConverter(DocumentConverter):
|
|||||||
md_content += body
|
md_content += body
|
||||||
|
|
||||||
msg.close()
|
msg.close()
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=headers.get("Subject"), text_content=md_content.strip()
|
title=headers.get("Subject"), text_content=md_content.strip()
|
||||||
@@ -56,7 +60,7 @@ class OutlookMsgConverter(DocumentConverter):
|
|||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise FileConversionException(
|
raise FileConversionException(
|
||||||
f"Could not convert MSG file '{local_path}': {str(e)}"
|
f"Could not convert MSG file '{input.filepath}': {str(e)}"
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_stream_data(
|
def _get_stream_data(
|
||||||
|
|||||||
@@ -1,7 +1,9 @@
|
|||||||
import pdfminer
|
import pdfminer
|
||||||
import pdfminer.high_level
|
import pdfminer.high_level
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
from io import StringIO
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class PdfConverter(DocumentConverter):
|
class PdfConverter(DocumentConverter):
|
||||||
@@ -14,13 +16,20 @@ class PdfConverter(DocumentConverter):
|
|||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(
|
||||||
|
self, input: ConverterInput, **kwargs
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a PDF
|
# Bail if not a PDF
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".pdf":
|
if extension.lower() != ".pdf":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
output = StringIO()
|
||||||
|
file_obj = input.read_file(mode="rb")
|
||||||
|
pdfminer.high_level.extract_text_to_fp(file_obj, output)
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=None,
|
title=None,
|
||||||
text_content=pdfminer.high_level.extract_text(local_path),
|
text_content=output.getvalue(),
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,9 +1,10 @@
|
|||||||
import mimetypes
|
import mimetypes
|
||||||
|
|
||||||
from charset_normalizer import from_path
|
from charset_normalizer import from_path, from_bytes
|
||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class PlainTextConverter(DocumentConverter):
|
class PlainTextConverter(DocumentConverter):
|
||||||
@@ -15,8 +16,11 @@ class PlainTextConverter(DocumentConverter):
|
|||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, input: ConverterInput, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
|
# Read file object from input
|
||||||
|
file_obj = input.read_file(mode="rb")
|
||||||
|
|
||||||
# Guess the content type from any file extension that might be around
|
# Guess the content type from any file extension that might be around
|
||||||
content_type, _ = mimetypes.guess_type(
|
content_type, _ = mimetypes.guess_type(
|
||||||
"__placeholder" + kwargs.get("file_extension", "")
|
"__placeholder" + kwargs.get("file_extension", "")
|
||||||
@@ -31,7 +35,8 @@ class PlainTextConverter(DocumentConverter):
|
|||||||
):
|
):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
text_content = str(from_path(local_path).best())
|
text_content = str(from_bytes(file_obj.read()).best())
|
||||||
|
file_obj.close()
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=None,
|
title=None,
|
||||||
text_content=text_content,
|
text_content=text_content,
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ from typing import Union
|
|||||||
|
|
||||||
from ._base import DocumentConverterResult, DocumentConverter
|
from ._base import DocumentConverterResult, DocumentConverter
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class PptxConverter(HtmlConverter):
|
class PptxConverter(HtmlConverter):
|
||||||
@@ -48,7 +49,9 @@ class PptxConverter(HtmlConverter):
|
|||||||
)
|
)
|
||||||
return response.choices[0].message.content
|
return response.choices[0].message.content
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(
|
||||||
|
self, input: ConverterInput, **kwargs
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a PPTX
|
# Bail if not a PPTX
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".pptx":
|
if extension.lower() != ".pptx":
|
||||||
@@ -56,7 +59,10 @@ class PptxConverter(HtmlConverter):
|
|||||||
|
|
||||||
md_content = ""
|
md_content = ""
|
||||||
|
|
||||||
presentation = pptx.Presentation(local_path)
|
file_obj = input.read_file(mode="rb")
|
||||||
|
presentation = pptx.Presentation(file_obj)
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
slide_num = 0
|
slide_num = 0
|
||||||
for slide in presentation.slides:
|
for slide in presentation.slides:
|
||||||
slide_num += 1
|
slide_num += 1
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ from bs4 import BeautifulSoup
|
|||||||
|
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class RssConverter(DocumentConverter):
|
class RssConverter(DocumentConverter):
|
||||||
@@ -15,16 +16,21 @@ class RssConverter(DocumentConverter):
|
|||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs
|
self, input: ConverterInput, **kwargs
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not RSS type
|
# Bail if not RSS type
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() not in [".xml", ".rss", ".atom"]:
|
if extension.lower() not in [".xml", ".rss", ".atom"]:
|
||||||
return None
|
return None
|
||||||
|
# Read file object from input
|
||||||
|
file_obj = input.read_file(mode="rb")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
doc = minidom.parse(local_path)
|
doc = minidom.parse(file_obj)
|
||||||
except BaseException as _:
|
except BaseException as _:
|
||||||
return None
|
return None
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
result = None
|
result = None
|
||||||
if doc.getElementsByTagName("rss"):
|
if doc.getElementsByTagName("rss"):
|
||||||
# A RSS feed must have a root element of <rss>
|
# A RSS feed must have a root element of <rss>
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
from typing import Union
|
from typing import Union
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
from ._media_converter import MediaConverter
|
from ._media_converter import MediaConverter
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
# Optional Transcription support
|
# Optional Transcription support
|
||||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
||||||
@@ -22,12 +23,19 @@ class WavConverter(MediaConverter):
|
|||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(
|
||||||
|
self, input: ConverterInput, **kwargs
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a WAV
|
# Bail if not a WAV
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".wav":
|
if extension.lower() != ".wav":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Bail if a local path was not provided
|
||||||
|
if input.input_type != "filepath":
|
||||||
|
return None
|
||||||
|
local_path = input.filepath
|
||||||
|
|
||||||
md_content = ""
|
md_content = ""
|
||||||
|
|
||||||
# Add metadata
|
# Add metadata
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
|
|||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class WikipediaConverter(DocumentConverter):
|
class WikipediaConverter(DocumentConverter):
|
||||||
@@ -16,7 +17,7 @@ class WikipediaConverter(DocumentConverter):
|
|||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, input: ConverterInput, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not Wikipedia
|
# Bail if not Wikipedia
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
@@ -28,8 +29,9 @@ class WikipediaConverter(DocumentConverter):
|
|||||||
|
|
||||||
# Parse the file
|
# Parse the file
|
||||||
soup = None
|
soup = None
|
||||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||||
soup = BeautifulSoup(fh.read(), "html.parser")
|
soup = BeautifulSoup(file_obj.read(), "html.parser")
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
# Remove javascript and style blocks
|
# Remove javascript and style blocks
|
||||||
for script in soup(["script", "style"]):
|
for script in soup(["script", "style"]):
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import pandas as pd
|
|||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class XlsxConverter(HtmlConverter):
|
class XlsxConverter(HtmlConverter):
|
||||||
@@ -16,13 +17,18 @@ class XlsxConverter(HtmlConverter):
|
|||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(
|
||||||
|
self, input: ConverterInput, **kwargs
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a XLSX
|
# Bail if not a XLSX
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".xlsx":
|
if extension.lower() != ".xlsx":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
|
file_obj = input.read_file(mode="rb")
|
||||||
|
sheets = pd.read_excel(file_obj, sheet_name=None, engine="openpyxl")
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
md_content = ""
|
md_content = ""
|
||||||
for s in sheets:
|
for s in sheets:
|
||||||
md_content += f"## {s}\n"
|
md_content += f"## {s}\n"
|
||||||
@@ -40,13 +46,18 @@ class XlsConverter(HtmlConverter):
|
|||||||
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
|
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(
|
||||||
|
self, input: ConverterInput, **kwargs
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a XLS
|
# Bail if not a XLS
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".xls":
|
if extension.lower() != ".xls":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
|
file_obj = input.read_file(mode="rb")
|
||||||
|
sheets = pd.read_excel(file_obj, sheet_name=None, engine="xlrd")
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
md_content = ""
|
md_content = ""
|
||||||
for s in sheets:
|
for s in sheets:
|
||||||
md_content += f"## {s}\n"
|
md_content += f"## {s}\n"
|
||||||
|
|||||||
@@ -1,10 +1,12 @@
|
|||||||
import re
|
import re
|
||||||
|
import json
|
||||||
|
|
||||||
from typing import Any, Union, Dict, List
|
from typing import Any, Union, Dict, List
|
||||||
from urllib.parse import parse_qs, urlparse
|
from urllib.parse import parse_qs, urlparse
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
# Optional YouTube transcription support
|
# Optional YouTube transcription support
|
||||||
@@ -25,7 +27,7 @@ class YouTubeConverter(DocumentConverter):
|
|||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, input: ConverterInput, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not YouTube
|
# Bail if not YouTube
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
@@ -37,8 +39,9 @@ class YouTubeConverter(DocumentConverter):
|
|||||||
|
|
||||||
# Parse the file
|
# Parse the file
|
||||||
soup = None
|
soup = None
|
||||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||||
soup = BeautifulSoup(fh.read(), "html.parser")
|
soup = BeautifulSoup(file_obj.read(), "html.parser")
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
# Read the meta tags
|
# Read the meta tags
|
||||||
assert soup.title is not None and soup.title.string is not None
|
assert soup.title is not None and soup.title.string is not None
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import shutil
|
|||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class ZipConverter(DocumentConverter):
|
class ZipConverter(DocumentConverter):
|
||||||
@@ -51,13 +52,18 @@ class ZipConverter(DocumentConverter):
|
|||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, input: ConverterInput, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a ZIP
|
# Bail if not a ZIP
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".zip":
|
if extension.lower() != ".zip":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Bail if a local path is not provided
|
||||||
|
if input.input_type != "filepath":
|
||||||
|
return None
|
||||||
|
local_path = input.filepath
|
||||||
|
|
||||||
# Get parent converters list if available
|
# Get parent converters list if available
|
||||||
parent_converters = kwargs.get("_parent_converters", [])
|
parent_converters = kwargs.get("_parent_converters", [])
|
||||||
if not parent_converters:
|
if not parent_converters:
|
||||||
@@ -111,7 +117,11 @@ class ZipConverter(DocumentConverter):
|
|||||||
if isinstance(converter, ZipConverter):
|
if isinstance(converter, ZipConverter):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
result = converter.convert(file_path, **file_kwargs)
|
# Create a ConverterInput for the parent converter and attempt conversion
|
||||||
|
input = ConverterInput(
|
||||||
|
input_type="filepath", filepath=file_path
|
||||||
|
)
|
||||||
|
result = converter.convert(input, **file_kwargs)
|
||||||
if result is not None:
|
if result is not None:
|
||||||
md_content += f"\n## File: {relative_path}\n\n"
|
md_content += f"\n## File: {relative_path}\n\n"
|
||||||
md_content += result.text_content + "\n\n"
|
md_content += result.text_content + "\n\n"
|
||||||
|
|||||||
@@ -189,7 +189,7 @@ def test_markitdown_remote() -> None:
|
|||||||
# assert test_string in result.text_content
|
# assert test_string in result.text_content
|
||||||
|
|
||||||
|
|
||||||
def test_markitdown_local() -> None:
|
def test_markitdown_local_paths() -> None:
|
||||||
markitdown = MarkItDown()
|
markitdown = MarkItDown()
|
||||||
|
|
||||||
# Test XLSX processing
|
# Test XLSX processing
|
||||||
@@ -272,6 +272,87 @@ def test_markitdown_local() -> None:
|
|||||||
assert "# Test" in result.text_content
|
assert "# Test" in result.text_content
|
||||||
|
|
||||||
|
|
||||||
|
def test_markitdown_local_objects() -> None:
|
||||||
|
markitdown = MarkItDown()
|
||||||
|
|
||||||
|
# Test XLSX processing
|
||||||
|
with open(os.path.join(TEST_FILES_DIR, "test.xlsx"), "rb") as f:
|
||||||
|
result = markitdown.convert(f, file_extension=".xlsx")
|
||||||
|
validate_strings(result, XLSX_TEST_STRINGS)
|
||||||
|
|
||||||
|
# Test XLS processing
|
||||||
|
with open(os.path.join(TEST_FILES_DIR, "test.xls"), "rb") as f:
|
||||||
|
result = markitdown.convert(f, file_extension=".xls")
|
||||||
|
for test_string in XLS_TEST_STRINGS:
|
||||||
|
text_content = result.text_content.replace("\\", "")
|
||||||
|
assert test_string in text_content
|
||||||
|
|
||||||
|
# Test DOCX processing
|
||||||
|
with open(os.path.join(TEST_FILES_DIR, "test.docx"), "rb") as f:
|
||||||
|
result = markitdown.convert(f, file_extension=".docx")
|
||||||
|
validate_strings(result, DOCX_TEST_STRINGS)
|
||||||
|
|
||||||
|
# Test DOCX processing, with comments
|
||||||
|
with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f:
|
||||||
|
result = markitdown.convert(
|
||||||
|
f,
|
||||||
|
file_extension=".docx",
|
||||||
|
style_map="comment-reference => ",
|
||||||
|
)
|
||||||
|
validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
|
||||||
|
|
||||||
|
# Test DOCX processing, with comments and setting style_map on init
|
||||||
|
markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
|
||||||
|
with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f:
|
||||||
|
result = markitdown_with_style_map.convert(f, file_extension=".docx")
|
||||||
|
validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
|
||||||
|
|
||||||
|
# Test PPTX processing
|
||||||
|
with open(os.path.join(TEST_FILES_DIR, "test.pptx"), "rb") as f:
|
||||||
|
result = markitdown.convert(f, file_extension=".pptx")
|
||||||
|
validate_strings(result, PPTX_TEST_STRINGS)
|
||||||
|
|
||||||
|
# Test HTML processing
|
||||||
|
with open(
|
||||||
|
os.path.join(TEST_FILES_DIR, "test_blog.html"), "rt", encoding="utf-8"
|
||||||
|
) as f:
|
||||||
|
result = markitdown.convert(f, file_extension=".html", url=BLOG_TEST_URL)
|
||||||
|
validate_strings(result, BLOG_TEST_STRINGS)
|
||||||
|
|
||||||
|
# Test Wikipedia processing
|
||||||
|
with open(
|
||||||
|
os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), "rt", encoding="utf-8"
|
||||||
|
) as f:
|
||||||
|
result = markitdown.convert(f, file_extension=".html", url=WIKIPEDIA_TEST_URL)
|
||||||
|
text_content = result.text_content.replace("\\", "")
|
||||||
|
validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)
|
||||||
|
|
||||||
|
# Test Bing processing
|
||||||
|
with open(
|
||||||
|
os.path.join(TEST_FILES_DIR, "test_serp.html"), "rt", encoding="utf-8"
|
||||||
|
) as f:
|
||||||
|
result = markitdown.convert(f, file_extension=".html", url=SERP_TEST_URL)
|
||||||
|
text_content = result.text_content.replace("\\", "")
|
||||||
|
validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)
|
||||||
|
|
||||||
|
# Test RSS processing
|
||||||
|
with open(os.path.join(TEST_FILES_DIR, "test_rss.xml"), "rb") as f:
|
||||||
|
result = markitdown.convert(f, file_extension=".xml")
|
||||||
|
text_content = result.text_content.replace("\\", "")
|
||||||
|
for test_string in RSS_TEST_STRINGS:
|
||||||
|
assert test_string in text_content
|
||||||
|
|
||||||
|
# Test MSG (Outlook email) processing
|
||||||
|
with open(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"), "rb") as f:
|
||||||
|
result = markitdown.convert(f, file_extension=".msg")
|
||||||
|
validate_strings(result, MSG_TEST_STRINGS)
|
||||||
|
|
||||||
|
# Test JSON processing
|
||||||
|
with open(os.path.join(TEST_FILES_DIR, "test.json"), "rb") as f:
|
||||||
|
result = markitdown.convert(f, file_extension=".json")
|
||||||
|
validate_strings(result, JSON_TEST_STRINGS)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
skip_exiftool,
|
skip_exiftool,
|
||||||
reason="do not run if exiftool is not installed",
|
reason="do not run if exiftool is not installed",
|
||||||
@@ -328,7 +409,8 @@ def test_markitdown_llm() -> None:
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
"""Runs this file's tests from the command line."""
|
"""Runs this file's tests from the command line."""
|
||||||
test_markitdown_remote()
|
test_markitdown_remote()
|
||||||
test_markitdown_local()
|
test_markitdown_local_paths()
|
||||||
|
test_markitdown_local_objects()
|
||||||
test_markitdown_exiftool()
|
test_markitdown_exiftool()
|
||||||
# test_markitdown_llm()
|
# test_markitdown_llm()
|
||||||
print("All tests passed!")
|
print("All tests passed!")
|
||||||
|
|||||||
Reference in New Issue
Block a user