From 07a28d4f0003dd96b6077f9d115bd96225550559 Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Thu, 20 Feb 2025 16:49:37 -0500 Subject: [PATCH] black formatting --- packages/markitdown/src/markitdown/_markitdown.py | 10 ++++++---- .../src/markitdown/converters/_bing_serp_converter.py | 4 +++- .../src/markitdown/converters/_converter_input.py | 10 ++++++---- .../src/markitdown/converters/_doc_intel_converter.py | 2 +- .../src/markitdown/converters/_docx_converter.py | 4 +++- .../src/markitdown/converters/_image_converter.py | 8 ++++++-- .../src/markitdown/converters/_mp3_converter.py | 4 +++- .../src/markitdown/converters/_pdf_converter.py | 5 +++-- .../src/markitdown/converters/_plain_text_converter.py | 2 +- .../src/markitdown/converters/_pptx_converter.py | 6 ++++-- .../src/markitdown/converters/_wav_converter.py | 6 ++++-- .../src/markitdown/converters/_xlsx_converter.py | 8 ++++++-- .../src/markitdown/converters/_zip_converter.py | 4 ++-- 13 files changed, 48 insertions(+), 25 deletions(-) diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index b09ed4e..188ab19 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -175,7 +175,9 @@ class MarkItDown: warn("Plugins converters are already enabled.", RuntimeWarning) def convert( - self, source: Union[str, requests.Response, Path, BufferedIOBase, TextIOBase], **kwargs: Any + self, + source: Union[str, requests.Response, Path, BufferedIOBase, TextIOBase], + **kwargs: Any, ) -> DocumentConverterResult: # TODO: deal with kwargs """ Args: @@ -222,10 +224,10 @@ class MarkItDown: # Convert return self._convert(input, extensions, **kwargs) - + def convert_file_object( self, file_object: Union[BufferedIOBase, TextIOBase], **kwargs: Any - ) -> DocumentConverterResult: #TODO: deal with kwargs + ) -> DocumentConverterResult: # TODO: deal with kwargs # Prepare a list of extensions to try (in order of priority) ext = kwargs.get("file_extension") extensions = [ext] if ext is not None else [] @@ -417,7 +419,7 @@ class MarkItDown: # Use puremagic to guess try: guesses = [] - + # Guess extensions for filepaths if isinstance(source, str): guesses = puremagic.magic_file(source) diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py index 36b9a01..156f40f 100644 --- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py +++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py @@ -22,7 +22,9 @@ class BingSerpConverter(DocumentConverter): ): super().__init__(priority=priority) - def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: + def convert( + self, input: ConverterInput, **kwargs + ) -> Union[None, DocumentConverterResult]: # Bail if not a Bing SERP extension = kwargs.get("file_extension", "") if extension.lower() not in [".html", ".htm"]: diff --git a/packages/markitdown/src/markitdown/converters/_converter_input.py b/packages/markitdown/src/markitdown/converters/_converter_input.py index e1a1024..a20b67c 100644 --- a/packages/markitdown/src/markitdown/converters/_converter_input.py +++ b/packages/markitdown/src/markitdown/converters/_converter_input.py @@ -1,9 +1,11 @@ from typing import Any, Union + class ConverterInput: """ Wrapper for inputs to converter functions. """ + def __init__( self, input_type: str = "filepath", @@ -12,17 +14,17 @@ class ConverterInput: ): if input_type not in ["filepath", "object"]: raise ValueError(f"Invalid converter input type: {input_type}") - + self.input_type = input_type self.filepath = filepath self.file_object = file_object def read_file( self, - mode: str = 'rb', + mode: str = "rb", encoding: Union[str, None] = None, ) -> Any: if self.input_type == "object": return self.file_object - - return open(self.filepath, mode=mode, encoding=encoding) \ No newline at end of file + + return open(self.filepath, mode=mode, encoding=encoding) diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index f411d89..51f4af2 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -61,7 +61,7 @@ class DocumentIntelligenceConverter(DocumentConverter): return None # Get the bytestring from the converter input - file_obj = input.read_file(mode='rb') + file_obj = input.read_file(mode="rb") file_bytes = file_obj.read() file_obj.close() diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index b97aa75..8fc21b5 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -21,7 +21,9 @@ class DocxConverter(HtmlConverter): ): super().__init__(priority=priority) - def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: + def convert( + self, input: ConverterInput, **kwargs + ) -> Union[None, DocumentConverterResult]: # Bail if not a DOCX extension = kwargs.get("file_extension", "") if extension.lower() != ".docx": diff --git a/packages/markitdown/src/markitdown/converters/_image_converter.py b/packages/markitdown/src/markitdown/converters/_image_converter.py index 197f5cf..3d2f6fe 100644 --- a/packages/markitdown/src/markitdown/converters/_image_converter.py +++ b/packages/markitdown/src/markitdown/converters/_image_converter.py @@ -14,7 +14,9 @@ class ImageConverter(MediaConverter): ): super().__init__(priority=priority) - def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: + def convert( + self, input: ConverterInput, **kwargs + ) -> Union[None, DocumentConverterResult]: # Bail if not an image extension = kwargs.get("file_extension", "") if extension.lower() not in [".jpg", ".jpeg", ".png"]: @@ -63,7 +65,9 @@ class ImageConverter(MediaConverter): text_content=md_content, ) - def _get_llm_description(self, input: ConverterInput, extension, client, model, prompt=None): + def _get_llm_description( + self, input: ConverterInput, extension, client, model, prompt=None + ): if prompt is None or prompt.strip() == "": prompt = "Write a detailed caption for this image." diff --git a/packages/markitdown/src/markitdown/converters/_mp3_converter.py b/packages/markitdown/src/markitdown/converters/_mp3_converter.py index b1afec4..be9be50 100644 --- a/packages/markitdown/src/markitdown/converters/_mp3_converter.py +++ b/packages/markitdown/src/markitdown/converters/_mp3_converter.py @@ -35,7 +35,9 @@ class Mp3Converter(WavConverter): ): super().__init__(priority=priority) - def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: + def convert( + self, input: ConverterInput, **kwargs + ) -> Union[None, DocumentConverterResult]: # Bail if not a MP3 extension = kwargs.get("file_extension", "") if extension.lower() != ".mp3": diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index d512eb3..8e8b203 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -16,7 +16,9 @@ class PdfConverter(DocumentConverter): ): super().__init__(priority=priority) - def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: + def convert( + self, input: ConverterInput, **kwargs + ) -> Union[None, DocumentConverterResult]: # Bail if not a PDF extension = kwargs.get("file_extension", "") if extension.lower() != ".pdf": @@ -31,4 +33,3 @@ class PdfConverter(DocumentConverter): title=None, text_content=output.getvalue(), ) - diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py index 2bc71ce..b23db82 100644 --- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py +++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py @@ -22,7 +22,7 @@ class PlainTextConverter(DocumentConverter): if input.input_type != "filepath": return None local_path = input.filepath - + # Guess the content type from any file extension that might be around content_type, _ = mimetypes.guess_type( "__placeholder" + kwargs.get("file_extension", "") diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index 07aa7b3..d75cecf 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -49,7 +49,9 @@ class PptxConverter(HtmlConverter): ) return response.choices[0].message.content - def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: + def convert( + self, input: ConverterInput, **kwargs + ) -> Union[None, DocumentConverterResult]: # Bail if not a PPTX extension = kwargs.get("file_extension", "") if extension.lower() != ".pptx": @@ -60,7 +62,7 @@ class PptxConverter(HtmlConverter): file_obj = input.read_file(mode="rb") presentation = pptx.Presentation(file_obj) file_obj.close() - + slide_num = 0 for slide in presentation.slides: slide_num += 1 diff --git a/packages/markitdown/src/markitdown/converters/_wav_converter.py b/packages/markitdown/src/markitdown/converters/_wav_converter.py index 9c602d7..cefeaa6 100644 --- a/packages/markitdown/src/markitdown/converters/_wav_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wav_converter.py @@ -23,12 +23,14 @@ class WavConverter(MediaConverter): ): super().__init__(priority=priority) - def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: + def convert( + self, input: ConverterInput, **kwargs + ) -> Union[None, DocumentConverterResult]: # Bail if not a WAV extension = kwargs.get("file_extension", "") if extension.lower() != ".wav": return None - + # Bail if a local path was not provided if input.input_type != "filepath": return None diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index 18d930f..fa64c41 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -17,7 +17,9 @@ class XlsxConverter(HtmlConverter): ): super().__init__(priority=priority) - def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: + def convert( + self, input: ConverterInput, **kwargs + ) -> Union[None, DocumentConverterResult]: # Bail if not a XLSX extension = kwargs.get("file_extension", "") if extension.lower() != ".xlsx": @@ -44,7 +46,9 @@ class XlsConverter(HtmlConverter): Converts XLS files to Markdown, with each sheet presented as a separate Markdown table. """ - def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: + def convert( + self, input: ConverterInput, **kwargs + ) -> Union[None, DocumentConverterResult]: # Bail if not a XLS extension = kwargs.get("file_extension", "") if extension.lower() != ".xls": diff --git a/packages/markitdown/src/markitdown/converters/_zip_converter.py b/packages/markitdown/src/markitdown/converters/_zip_converter.py index c302b73..8891b19 100644 --- a/packages/markitdown/src/markitdown/converters/_zip_converter.py +++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py @@ -58,7 +58,7 @@ class ZipConverter(DocumentConverter): extension = kwargs.get("file_extension", "") if extension.lower() != ".zip": return None - + # Bail if a local path is not provided if input.input_type != "filepath": return None @@ -116,7 +116,7 @@ class ZipConverter(DocumentConverter): # Skip the zip converter to avoid infinite recursion if isinstance(converter, ZipConverter): continue - + # Create a ConverterInput for the parent converter and attempt conversion input = ConverterInput( input_type="filepath", filepath=file_path