Refactored more converters to accept a ConverterInput object instead of a local path

This commit is contained in:
Kenny Zhang
2025-02-19 13:34:28 -05:00
parent 52cbff061a
commit 096fef3d5f
6 changed files with 38 additions and 9 deletions

View File

@@ -1,8 +1,10 @@
import tempfile
import os
from typing import Union
from ._base import DocumentConverter, DocumentConverterResult
from ._wav_converter import WavConverter
from warnings import resetwarnings, catch_warnings
from ._converter_input import ConverterInput
# Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
@@ -33,12 +35,17 @@ class Mp3Converter(WavConverter):
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a MP3
extension = kwargs.get("file_extension", "")
if extension.lower() != ".mp3":
return None
# Bail if a local path was not provided
if input.input_type != "filepath":
return None
local_path = input.filepath
md_content = ""
# Add metadata

View File

@@ -1,6 +1,7 @@
import olefile
from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
class OutlookMsgConverter(DocumentConverter):
@@ -17,7 +18,7 @@ class OutlookMsgConverter(DocumentConverter):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not a MSG file
extension = kwargs.get("file_extension", "")
@@ -25,7 +26,8 @@ class OutlookMsgConverter(DocumentConverter):
return None
try:
msg = olefile.OleFileIO(local_path)
file_obj = input.read_file(mode="rt", encoding="utf-8")
msg = olefile.OleFileIO(file_obj)
# Extract email metadata
md_content = "# Email Message\n\n"
@@ -56,7 +58,7 @@ class OutlookMsgConverter(DocumentConverter):
except Exception as e:
raise FileConversionException(
f"Could not convert MSG file '{local_path}': {str(e)}"
f"Could not convert MSG file '{input.filepath}': {str(e)}"
)
def _get_stream_data(

View File

@@ -4,6 +4,7 @@ from charset_normalizer import from_path
from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
class PlainTextConverter(DocumentConverter):
@@ -15,8 +16,13 @@ class PlainTextConverter(DocumentConverter):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if a local path is not provided
if input.input_type != "filepath":
return None
local_path = input.filepath
# Guess the content type from any file extension that might be around
content_type, _ = mimetypes.guess_type(
"__placeholder" + kwargs.get("file_extension", "")

View File

@@ -7,6 +7,7 @@ from typing import Union
from ._base import DocumentConverterResult, DocumentConverter
from ._html_converter import HtmlConverter
from ._converter_input import ConverterInput
class PptxConverter(HtmlConverter):
@@ -48,7 +49,7 @@ class PptxConverter(HtmlConverter):
)
return response.choices[0].message.content
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a PPTX
extension = kwargs.get("file_extension", "")
if extension.lower() != ".pptx":
@@ -56,7 +57,8 @@ class PptxConverter(HtmlConverter):
md_content = ""
presentation = pptx.Presentation(local_path)
file_obj = input.read_file(mode="rb")
presentation = pptx.Presentation(file_obj)
slide_num = 0
for slide in presentation.slides:
slide_num += 1

View File

@@ -4,6 +4,7 @@ from bs4 import BeautifulSoup
from ._markdownify import _CustomMarkdownify
from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
class RssConverter(DocumentConverter):
@@ -15,12 +16,17 @@ class RssConverter(DocumentConverter):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not RSS type
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".xml", ".rss", ".atom"]:
return None
# Bail if a local path is not provided
if input.input_type != "filepath":
return None
local_path = input.filepath
try:
doc = minidom.parse(local_path)
except BaseException as _:

View File

@@ -1,6 +1,7 @@
from typing import Union
from ._base import DocumentConverter, DocumentConverterResult
from ._media_converter import MediaConverter
from ._converter_input import ConverterInput
# Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
@@ -22,11 +23,16 @@ class WavConverter(MediaConverter):
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a WAV
extension = kwargs.get("file_extension", "")
if extension.lower() != ".wav":
return None
# Bail if a local path was not provided
if input.input_type != "filepath":
return None
local_path = input.filepath
md_content = ""