black formatting

This commit is contained in:
Kenny Zhang
2025-02-20 16:49:37 -05:00
parent b8b3897952
commit 07a28d4f00
13 changed files with 48 additions and 25 deletions

View File

@@ -175,7 +175,9 @@ class MarkItDown:
warn("Plugins converters are already enabled.", RuntimeWarning) warn("Plugins converters are already enabled.", RuntimeWarning)
def convert( def convert(
self, source: Union[str, requests.Response, Path, BufferedIOBase, TextIOBase], **kwargs: Any self,
source: Union[str, requests.Response, Path, BufferedIOBase, TextIOBase],
**kwargs: Any,
) -> DocumentConverterResult: # TODO: deal with kwargs ) -> DocumentConverterResult: # TODO: deal with kwargs
""" """
Args: Args:
@@ -225,7 +227,7 @@ class MarkItDown:
def convert_file_object( def convert_file_object(
self, file_object: Union[BufferedIOBase, TextIOBase], **kwargs: Any self, file_object: Union[BufferedIOBase, TextIOBase], **kwargs: Any
) -> DocumentConverterResult: #TODO: deal with kwargs ) -> DocumentConverterResult: # TODO: deal with kwargs
# Prepare a list of extensions to try (in order of priority) # Prepare a list of extensions to try (in order of priority)
ext = kwargs.get("file_extension") ext = kwargs.get("file_extension")
extensions = [ext] if ext is not None else [] extensions = [ext] if ext is not None else []

View File

@@ -22,7 +22,9 @@ class BingSerpConverter(DocumentConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a Bing SERP # Bail if not a Bing SERP
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() not in [".html", ".htm"]: if extension.lower() not in [".html", ".htm"]:

View File

@@ -1,9 +1,11 @@
from typing import Any, Union from typing import Any, Union
class ConverterInput: class ConverterInput:
""" """
Wrapper for inputs to converter functions. Wrapper for inputs to converter functions.
""" """
def __init__( def __init__(
self, self,
input_type: str = "filepath", input_type: str = "filepath",
@@ -19,7 +21,7 @@ class ConverterInput:
def read_file( def read_file(
self, self,
mode: str = 'rb', mode: str = "rb",
encoding: Union[str, None] = None, encoding: Union[str, None] = None,
) -> Any: ) -> Any:
if self.input_type == "object": if self.input_type == "object":

View File

@@ -61,7 +61,7 @@ class DocumentIntelligenceConverter(DocumentConverter):
return None return None
# Get the bytestring from the converter input # Get the bytestring from the converter input
file_obj = input.read_file(mode='rb') file_obj = input.read_file(mode="rb")
file_bytes = file_obj.read() file_bytes = file_obj.read()
file_obj.close() file_obj.close()

View File

@@ -21,7 +21,9 @@ class DocxConverter(HtmlConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a DOCX # Bail if not a DOCX
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() != ".docx": if extension.lower() != ".docx":

View File

@@ -14,7 +14,9 @@ class ImageConverter(MediaConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not an image # Bail if not an image
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() not in [".jpg", ".jpeg", ".png"]: if extension.lower() not in [".jpg", ".jpeg", ".png"]:
@@ -63,7 +65,9 @@ class ImageConverter(MediaConverter):
text_content=md_content, text_content=md_content,
) )
def _get_llm_description(self, input: ConverterInput, extension, client, model, prompt=None): def _get_llm_description(
self, input: ConverterInput, extension, client, model, prompt=None
):
if prompt is None or prompt.strip() == "": if prompt is None or prompt.strip() == "":
prompt = "Write a detailed caption for this image." prompt = "Write a detailed caption for this image."

View File

@@ -35,7 +35,9 @@ class Mp3Converter(WavConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a MP3 # Bail if not a MP3
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() != ".mp3": if extension.lower() != ".mp3":

View File

@@ -16,7 +16,9 @@ class PdfConverter(DocumentConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a PDF # Bail if not a PDF
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() != ".pdf": if extension.lower() != ".pdf":
@@ -31,4 +33,3 @@ class PdfConverter(DocumentConverter):
title=None, title=None,
text_content=output.getvalue(), text_content=output.getvalue(),
) )

View File

@@ -49,7 +49,9 @@ class PptxConverter(HtmlConverter):
) )
return response.choices[0].message.content return response.choices[0].message.content
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a PPTX # Bail if not a PPTX
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() != ".pptx": if extension.lower() != ".pptx":

View File

@@ -23,7 +23,9 @@ class WavConverter(MediaConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a WAV # Bail if not a WAV
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() != ".wav": if extension.lower() != ".wav":

View File

@@ -17,7 +17,9 @@ class XlsxConverter(HtmlConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a XLSX # Bail if not a XLSX
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() != ".xlsx": if extension.lower() != ".xlsx":
@@ -44,7 +46,9 @@ class XlsConverter(HtmlConverter):
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table. Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
""" """
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a XLS # Bail if not a XLS
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() != ".xls": if extension.lower() != ".xls":