From 096fef3d5f3534e712b5dfbc4d96a2d2a628f4cb Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Wed, 19 Feb 2025 13:34:28 -0500 Subject: [PATCH] refactored more converters to support input class --- .../src/markitdown/converters/_mp3_converter.py | 9 ++++++++- .../src/markitdown/converters/_outlook_msg_converter.py | 8 +++++--- .../src/markitdown/converters/_plain_text_converter.py | 8 +++++++- .../src/markitdown/converters/_pptx_converter.py | 6 ++++-- .../src/markitdown/converters/_rss_converter.py | 8 +++++++- .../src/markitdown/converters/_wav_converter.py | 8 +++++++- 6 files changed, 38 insertions(+), 9 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_mp3_converter.py b/packages/markitdown/src/markitdown/converters/_mp3_converter.py index 91fd270..b1afec4 100644 --- a/packages/markitdown/src/markitdown/converters/_mp3_converter.py +++ b/packages/markitdown/src/markitdown/converters/_mp3_converter.py @@ -1,8 +1,10 @@ import tempfile +import os from typing import Union from ._base import DocumentConverter, DocumentConverterResult from ._wav_converter import WavConverter from warnings import resetwarnings, catch_warnings +from ._converter_input import ConverterInput # Optional Transcription support IS_AUDIO_TRANSCRIPTION_CAPABLE = False @@ -33,12 +35,17 @@ class Mp3Converter(WavConverter): ): super().__init__(priority=priority) - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a MP3 extension = kwargs.get("file_extension", "") if extension.lower() != ".mp3": return None + # Bail if a local path was not provided + if input.input_type != "filepath": + return None + local_path = input.filepath + md_content = "" # Add metadata diff --git a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py index 6764fc5..f0c33fe 100644 --- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py +++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py @@ -1,6 +1,7 @@ import olefile from typing import Any, Union from ._base import DocumentConverter, DocumentConverterResult +from ._converter_input import ConverterInput class OutlookMsgConverter(DocumentConverter): @@ -17,7 +18,7 @@ class OutlookMsgConverter(DocumentConverter): super().__init__(priority=priority) def convert( - self, local_path: str, **kwargs: Any + self, input: ConverterInput, **kwargs: Any ) -> Union[None, DocumentConverterResult]: # Bail if not a MSG file extension = kwargs.get("file_extension", "") @@ -25,7 +26,8 @@ class OutlookMsgConverter(DocumentConverter): return None try: - msg = olefile.OleFileIO(local_path) + file_obj = input.read_file(mode="rt", encoding="utf-8") + msg = olefile.OleFileIO(file_obj) # Extract email metadata md_content = "# Email Message\n\n" @@ -56,7 +58,7 @@ class OutlookMsgConverter(DocumentConverter): except Exception as e: raise FileConversionException( - f"Could not convert MSG file '{local_path}': {str(e)}" + f"Could not convert MSG file '{input.filepath}': {str(e)}" ) def _get_stream_data( diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py index 75f74a8..2bc71ce 100644 --- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py +++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py @@ -4,6 +4,7 @@ from charset_normalizer import from_path from typing import Any, Union from ._base import DocumentConverter, DocumentConverterResult +from ._converter_input import ConverterInput class PlainTextConverter(DocumentConverter): @@ -15,8 +16,13 @@ class PlainTextConverter(DocumentConverter): super().__init__(priority=priority) def convert( - self, local_path: str, **kwargs: Any + self, input: ConverterInput, **kwargs: Any ) -> Union[None, DocumentConverterResult]: + # Bail if a local path is not provided + if input.input_type != "filepath": + return None + local_path = input.filepath + # Guess the content type from any file extension that might be around content_type, _ = mimetypes.guess_type( "__placeholder" + kwargs.get("file_extension", "") diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index afb37a0..a5ee72d 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -7,6 +7,7 @@ from typing import Union from ._base import DocumentConverterResult, DocumentConverter from ._html_converter import HtmlConverter +from ._converter_input import ConverterInput class PptxConverter(HtmlConverter): @@ -48,7 +49,7 @@ class PptxConverter(HtmlConverter): ) return response.choices[0].message.content - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a PPTX extension = kwargs.get("file_extension", "") if extension.lower() != ".pptx": @@ -56,7 +57,8 @@ class PptxConverter(HtmlConverter): md_content = "" - presentation = pptx.Presentation(local_path) + file_obj = input.read_file(mode="rb") + presentation = pptx.Presentation(file_obj) slide_num = 0 for slide in presentation.slides: slide_num += 1 diff --git a/packages/markitdown/src/markitdown/converters/_rss_converter.py b/packages/markitdown/src/markitdown/converters/_rss_converter.py index b279c85..89f41c0 100644 --- a/packages/markitdown/src/markitdown/converters/_rss_converter.py +++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py @@ -4,6 +4,7 @@ from bs4 import BeautifulSoup from ._markdownify import _CustomMarkdownify from ._base import DocumentConverter, DocumentConverterResult +from ._converter_input import ConverterInput class RssConverter(DocumentConverter): @@ -15,12 +16,17 @@ class RssConverter(DocumentConverter): super().__init__(priority=priority) def convert( - self, local_path: str, **kwargs + self, input: ConverterInput, **kwargs ) -> Union[None, DocumentConverterResult]: # Bail if not RSS type extension = kwargs.get("file_extension", "") if extension.lower() not in [".xml", ".rss", ".atom"]: return None + # Bail if a local path is not provided + if input.input_type != "filepath": + return None + local_path = input.filepath + try: doc = minidom.parse(local_path) except BaseException as _: diff --git a/packages/markitdown/src/markitdown/converters/_wav_converter.py b/packages/markitdown/src/markitdown/converters/_wav_converter.py index 3c8d842..9c602d7 100644 --- a/packages/markitdown/src/markitdown/converters/_wav_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wav_converter.py @@ -1,6 +1,7 @@ from typing import Union from ._base import DocumentConverter, DocumentConverterResult from ._media_converter import MediaConverter +from ._converter_input import ConverterInput # Optional Transcription support IS_AUDIO_TRANSCRIPTION_CAPABLE = False @@ -22,11 +23,16 @@ class WavConverter(MediaConverter): ): super().__init__(priority=priority) - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a WAV extension = kwargs.get("file_extension", "") if extension.lower() != ".wav": return None + + # Bail if a local path was not provided + if input.input_type != "filepath": + return None + local_path = input.filepath md_content = ""