Refactored more converters to accept a ConverterInput object instead of a local file path
This commit is contained in:
@@ -1,8 +1,10 @@
|
||||
import tempfile
|
||||
import os
|
||||
from typing import Union
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from ._wav_converter import WavConverter
|
||||
from warnings import resetwarnings, catch_warnings
|
||||
from ._converter_input import ConverterInput
|
||||
|
||||
# Optional Transcription support
|
||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
||||
@@ -33,12 +35,17 @@ class Mp3Converter(WavConverter):
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not a MP3
|
||||
extension = kwargs.get("file_extension", "")
|
||||
if extension.lower() != ".mp3":
|
||||
return None
|
||||
|
||||
# Bail if a local path was not provided
|
||||
if input.input_type != "filepath":
|
||||
return None
|
||||
local_path = input.filepath
|
||||
|
||||
md_content = ""
|
||||
|
||||
# Add metadata
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import olefile
|
||||
from typing import Any, Union
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from ._converter_input import ConverterInput
|
||||
|
||||
|
||||
class OutlookMsgConverter(DocumentConverter):
|
||||
@@ -17,7 +18,7 @@ class OutlookMsgConverter(DocumentConverter):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
self, input: ConverterInput, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not a MSG file
|
||||
extension = kwargs.get("file_extension", "")
|
||||
@@ -25,7 +26,8 @@ class OutlookMsgConverter(DocumentConverter):
|
||||
return None
|
||||
|
||||
try:
|
||||
msg = olefile.OleFileIO(local_path)
|
||||
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||
msg = olefile.OleFileIO(file_obj)
|
||||
# Extract email metadata
|
||||
md_content = "# Email Message\n\n"
|
||||
|
||||
@@ -56,7 +58,7 @@ class OutlookMsgConverter(DocumentConverter):
|
||||
|
||||
except Exception as e:
|
||||
raise FileConversionException(
|
||||
f"Could not convert MSG file '{local_path}': {str(e)}"
|
||||
f"Could not convert MSG file '{input.filepath}': {str(e)}"
|
||||
)
|
||||
|
||||
def _get_stream_data(
|
||||
|
||||
@@ -4,6 +4,7 @@ from charset_normalizer import from_path
|
||||
from typing import Any, Union
|
||||
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from ._converter_input import ConverterInput
|
||||
|
||||
|
||||
class PlainTextConverter(DocumentConverter):
|
||||
@@ -15,8 +16,13 @@ class PlainTextConverter(DocumentConverter):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
self, input: ConverterInput, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if a local path is not provided
|
||||
if input.input_type != "filepath":
|
||||
return None
|
||||
local_path = input.filepath
|
||||
|
||||
# Guess the content type from any file extension that might be around
|
||||
content_type, _ = mimetypes.guess_type(
|
||||
"__placeholder" + kwargs.get("file_extension", "")
|
||||
|
||||
@@ -7,6 +7,7 @@ from typing import Union
|
||||
|
||||
from ._base import DocumentConverterResult, DocumentConverter
|
||||
from ._html_converter import HtmlConverter
|
||||
from ._converter_input import ConverterInput
|
||||
|
||||
|
||||
class PptxConverter(HtmlConverter):
|
||||
@@ -48,7 +49,7 @@ class PptxConverter(HtmlConverter):
|
||||
)
|
||||
return response.choices[0].message.content
|
||||
|
||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not a PPTX
|
||||
extension = kwargs.get("file_extension", "")
|
||||
if extension.lower() != ".pptx":
|
||||
@@ -56,7 +57,8 @@ class PptxConverter(HtmlConverter):
|
||||
|
||||
md_content = ""
|
||||
|
||||
presentation = pptx.Presentation(local_path)
|
||||
file_obj = input.read_file(mode="rb")
|
||||
presentation = pptx.Presentation(file_obj)
|
||||
slide_num = 0
|
||||
for slide in presentation.slides:
|
||||
slide_num += 1
|
||||
|
||||
@@ -4,6 +4,7 @@ from bs4 import BeautifulSoup
|
||||
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from ._converter_input import ConverterInput
|
||||
|
||||
|
||||
class RssConverter(DocumentConverter):
|
||||
@@ -15,12 +16,17 @@ class RssConverter(DocumentConverter):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs
|
||||
self, input: ConverterInput, **kwargs
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not RSS type
|
||||
extension = kwargs.get("file_extension", "")
|
||||
if extension.lower() not in [".xml", ".rss", ".atom"]:
|
||||
return None
|
||||
# Bail if a local path is not provided
|
||||
if input.input_type != "filepath":
|
||||
return None
|
||||
local_path = input.filepath
|
||||
|
||||
try:
|
||||
doc = minidom.parse(local_path)
|
||||
except BaseException as _:
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from typing import Union
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from ._media_converter import MediaConverter
|
||||
from ._converter_input import ConverterInput
|
||||
|
||||
# Optional Transcription support
|
||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
||||
@@ -22,11 +23,16 @@ class WavConverter(MediaConverter):
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not a WAV
|
||||
extension = kwargs.get("file_extension", "")
|
||||
if extension.lower() != ".wav":
|
||||
return None
|
||||
|
||||
# Bail if a local path was not provided
|
||||
if input.input_type != "filepath":
|
||||
return None
|
||||
local_path = input.filepath
|
||||
|
||||
md_content = ""
|
||||
|
||||
|
||||
Reference in New Issue
Block a user