black formatting
This commit is contained in:
@@ -175,7 +175,9 @@ class MarkItDown:
|
|||||||
warn("Plugins converters are already enabled.", RuntimeWarning)
|
warn("Plugins converters are already enabled.", RuntimeWarning)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, source: Union[str, requests.Response, Path, BufferedIOBase, TextIOBase], **kwargs: Any
|
self,
|
||||||
|
source: Union[str, requests.Response, Path, BufferedIOBase, TextIOBase],
|
||||||
|
**kwargs: Any,
|
||||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
@@ -222,10 +224,10 @@ class MarkItDown:
|
|||||||
|
|
||||||
# Convert
|
# Convert
|
||||||
return self._convert(input, extensions, **kwargs)
|
return self._convert(input, extensions, **kwargs)
|
||||||
|
|
||||||
def convert_file_object(
|
def convert_file_object(
|
||||||
self, file_object: Union[BufferedIOBase, TextIOBase], **kwargs: Any
|
self, file_object: Union[BufferedIOBase, TextIOBase], **kwargs: Any
|
||||||
) -> DocumentConverterResult: #TODO: deal with kwargs
|
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||||
# Prepare a list of extensions to try (in order of priority)
|
# Prepare a list of extensions to try (in order of priority)
|
||||||
ext = kwargs.get("file_extension")
|
ext = kwargs.get("file_extension")
|
||||||
extensions = [ext] if ext is not None else []
|
extensions = [ext] if ext is not None else []
|
||||||
@@ -417,7 +419,7 @@ class MarkItDown:
|
|||||||
# Use puremagic to guess
|
# Use puremagic to guess
|
||||||
try:
|
try:
|
||||||
guesses = []
|
guesses = []
|
||||||
|
|
||||||
# Guess extensions for filepaths
|
# Guess extensions for filepaths
|
||||||
if isinstance(source, str):
|
if isinstance(source, str):
|
||||||
guesses = puremagic.magic_file(source)
|
guesses = puremagic.magic_file(source)
|
||||||
|
|||||||
@@ -22,7 +22,9 @@ class BingSerpConverter(DocumentConverter):
|
|||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(
|
||||||
|
self, input: ConverterInput, **kwargs
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a Bing SERP
|
# Bail if not a Bing SERP
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() not in [".html", ".htm"]:
|
if extension.lower() not in [".html", ".htm"]:
|
||||||
|
|||||||
@@ -1,9 +1,11 @@
|
|||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
|
|
||||||
|
|
||||||
class ConverterInput:
|
class ConverterInput:
|
||||||
"""
|
"""
|
||||||
Wrapper for inputs to converter functions.
|
Wrapper for inputs to converter functions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
input_type: str = "filepath",
|
input_type: str = "filepath",
|
||||||
@@ -12,17 +14,17 @@ class ConverterInput:
|
|||||||
):
|
):
|
||||||
if input_type not in ["filepath", "object"]:
|
if input_type not in ["filepath", "object"]:
|
||||||
raise ValueError(f"Invalid converter input type: {input_type}")
|
raise ValueError(f"Invalid converter input type: {input_type}")
|
||||||
|
|
||||||
self.input_type = input_type
|
self.input_type = input_type
|
||||||
self.filepath = filepath
|
self.filepath = filepath
|
||||||
self.file_object = file_object
|
self.file_object = file_object
|
||||||
|
|
||||||
def read_file(
|
def read_file(
|
||||||
self,
|
self,
|
||||||
mode: str = 'rb',
|
mode: str = "rb",
|
||||||
encoding: Union[str, None] = None,
|
encoding: Union[str, None] = None,
|
||||||
) -> Any:
|
) -> Any:
|
||||||
if self.input_type == "object":
|
if self.input_type == "object":
|
||||||
return self.file_object
|
return self.file_object
|
||||||
|
|
||||||
return open(self.filepath, mode=mode, encoding=encoding)
|
return open(self.filepath, mode=mode, encoding=encoding)
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
# Get the bytestring from the converter input
|
# Get the bytestring from the converter input
|
||||||
file_obj = input.read_file(mode='rb')
|
file_obj = input.read_file(mode="rb")
|
||||||
file_bytes = file_obj.read()
|
file_bytes = file_obj.read()
|
||||||
file_obj.close()
|
file_obj.close()
|
||||||
|
|
||||||
|
|||||||
@@ -21,7 +21,9 @@ class DocxConverter(HtmlConverter):
|
|||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(
|
||||||
|
self, input: ConverterInput, **kwargs
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a DOCX
|
# Bail if not a DOCX
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".docx":
|
if extension.lower() != ".docx":
|
||||||
|
|||||||
@@ -14,7 +14,9 @@ class ImageConverter(MediaConverter):
|
|||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(
|
||||||
|
self, input: ConverterInput, **kwargs
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not an image
|
# Bail if not an image
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() not in [".jpg", ".jpeg", ".png"]:
|
if extension.lower() not in [".jpg", ".jpeg", ".png"]:
|
||||||
@@ -63,7 +65,9 @@ class ImageConverter(MediaConverter):
|
|||||||
text_content=md_content,
|
text_content=md_content,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_llm_description(self, input: ConverterInput, extension, client, model, prompt=None):
|
def _get_llm_description(
|
||||||
|
self, input: ConverterInput, extension, client, model, prompt=None
|
||||||
|
):
|
||||||
if prompt is None or prompt.strip() == "":
|
if prompt is None or prompt.strip() == "":
|
||||||
prompt = "Write a detailed caption for this image."
|
prompt = "Write a detailed caption for this image."
|
||||||
|
|
||||||
|
|||||||
@@ -35,7 +35,9 @@ class Mp3Converter(WavConverter):
|
|||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(
|
||||||
|
self, input: ConverterInput, **kwargs
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a MP3
|
# Bail if not a MP3
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".mp3":
|
if extension.lower() != ".mp3":
|
||||||
|
|||||||
@@ -16,7 +16,9 @@ class PdfConverter(DocumentConverter):
|
|||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(
|
||||||
|
self, input: ConverterInput, **kwargs
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a PDF
|
# Bail if not a PDF
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".pdf":
|
if extension.lower() != ".pdf":
|
||||||
@@ -31,4 +33,3 @@ class PdfConverter(DocumentConverter):
|
|||||||
title=None,
|
title=None,
|
||||||
text_content=output.getvalue(),
|
text_content=output.getvalue(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ class PlainTextConverter(DocumentConverter):
|
|||||||
if input.input_type != "filepath":
|
if input.input_type != "filepath":
|
||||||
return None
|
return None
|
||||||
local_path = input.filepath
|
local_path = input.filepath
|
||||||
|
|
||||||
# Guess the content type from any file extension that might be around
|
# Guess the content type from any file extension that might be around
|
||||||
content_type, _ = mimetypes.guess_type(
|
content_type, _ = mimetypes.guess_type(
|
||||||
"__placeholder" + kwargs.get("file_extension", "")
|
"__placeholder" + kwargs.get("file_extension", "")
|
||||||
|
|||||||
@@ -49,7 +49,9 @@ class PptxConverter(HtmlConverter):
|
|||||||
)
|
)
|
||||||
return response.choices[0].message.content
|
return response.choices[0].message.content
|
||||||
|
|
||||||
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(
|
||||||
|
self, input: ConverterInput, **kwargs
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a PPTX
|
# Bail if not a PPTX
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".pptx":
|
if extension.lower() != ".pptx":
|
||||||
@@ -60,7 +62,7 @@ class PptxConverter(HtmlConverter):
|
|||||||
file_obj = input.read_file(mode="rb")
|
file_obj = input.read_file(mode="rb")
|
||||||
presentation = pptx.Presentation(file_obj)
|
presentation = pptx.Presentation(file_obj)
|
||||||
file_obj.close()
|
file_obj.close()
|
||||||
|
|
||||||
slide_num = 0
|
slide_num = 0
|
||||||
for slide in presentation.slides:
|
for slide in presentation.slides:
|
||||||
slide_num += 1
|
slide_num += 1
|
||||||
|
|||||||
@@ -23,12 +23,14 @@ class WavConverter(MediaConverter):
|
|||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(
|
||||||
|
self, input: ConverterInput, **kwargs
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a WAV
|
# Bail if not a WAV
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".wav":
|
if extension.lower() != ".wav":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Bail if a local path was not provided
|
# Bail if a local path was not provided
|
||||||
if input.input_type != "filepath":
|
if input.input_type != "filepath":
|
||||||
return None
|
return None
|
||||||
|
|||||||
@@ -17,7 +17,9 @@ class XlsxConverter(HtmlConverter):
|
|||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(
|
||||||
|
self, input: ConverterInput, **kwargs
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a XLSX
|
# Bail if not a XLSX
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".xlsx":
|
if extension.lower() != ".xlsx":
|
||||||
@@ -44,7 +46,9 @@ class XlsConverter(HtmlConverter):
|
|||||||
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
|
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(
|
||||||
|
self, input: ConverterInput, **kwargs
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a XLS
|
# Bail if not a XLS
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".xls":
|
if extension.lower() != ".xls":
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ class ZipConverter(DocumentConverter):
|
|||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".zip":
|
if extension.lower() != ".zip":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Bail if a local path is not provided
|
# Bail if a local path is not provided
|
||||||
if input.input_type != "filepath":
|
if input.input_type != "filepath":
|
||||||
return None
|
return None
|
||||||
@@ -116,7 +116,7 @@ class ZipConverter(DocumentConverter):
|
|||||||
# Skip the zip converter to avoid infinite recursion
|
# Skip the zip converter to avoid infinite recursion
|
||||||
if isinstance(converter, ZipConverter):
|
if isinstance(converter, ZipConverter):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Create a ConverterInput for the parent converter and attempt conversion
|
# Create a ConverterInput for the parent converter and attempt conversion
|
||||||
input = ConverterInput(
|
input = ConverterInput(
|
||||||
input_type="filepath", filepath=file_path
|
input_type="filepath", filepath=file_path
|
||||||
|
|||||||
Reference in New Issue
Block a user