begin refactoring converter classes

This commit is contained in:
Kenny Zhang
2025-02-19 11:48:00 -05:00
parent 0027e6d425
commit 52cbff061a
10 changed files with 59 additions and 37 deletions

View File

@@ -10,7 +10,6 @@ from ._exceptions import (
FileConversionException, FileConversionException,
UnsupportedFormatException, UnsupportedFormatException,
) )
from ._input import ConverterInput
from .converters import DocumentConverter, DocumentConverterResult from .converters import DocumentConverter, DocumentConverterResult
__all__ = [ __all__ = [
@@ -22,5 +21,4 @@ __all__ = [
"ConverterPrerequisiteException", "ConverterPrerequisiteException",
"FileConversionException", "FileConversionException",
"UnsupportedFormatException", "UnsupportedFormatException",
"ConverterInput",
] ]

View File

@@ -20,6 +20,7 @@ from ._mp3_converter import Mp3Converter
from ._outlook_msg_converter import OutlookMsgConverter from ._outlook_msg_converter import OutlookMsgConverter
from ._zip_converter import ZipConverter from ._zip_converter import ZipConverter
from ._doc_intel_converter import DocumentIntelligenceConverter from ._doc_intel_converter import DocumentIntelligenceConverter
from ._converter_input import ConverterInput
__all__ = [ __all__ = [
"DocumentConverter", "DocumentConverter",
@@ -42,4 +43,5 @@ __all__ = [
"OutlookMsgConverter", "OutlookMsgConverter",
"ZipConverter", "ZipConverter",
"DocumentIntelligenceConverter", "DocumentIntelligenceConverter",
"ConverterInput",
] ]

View File

@@ -8,6 +8,7 @@ from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify
from ._converter_input import ConverterInput
class BingSerpConverter(DocumentConverter): class BingSerpConverter(DocumentConverter):
@@ -21,7 +22,7 @@ class BingSerpConverter(DocumentConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a Bing SERP # Bail if not a Bing SERP
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() not in [".html", ".htm"]: if extension.lower() not in [".html", ".htm"]:
@@ -36,8 +37,8 @@ class BingSerpConverter(DocumentConverter):
# Parse the file # Parse the file
soup = None soup = None
with open(local_path, "rt", encoding="utf-8") as fh: file_obj = input.read_file(mode="rt", encoding="utf-8")
soup = BeautifulSoup(fh.read(), "html.parser") soup = BeautifulSoup(file_obj.read(), "html.parser")
# Clean up some formatting # Clean up some formatting
for tptt in soup.find_all(class_="tptt"): for tptt in soup.find_all(class_="tptt"):

View File

@@ -16,3 +16,13 @@ class ConverterInput:
self.input_type = input_type self.input_type = input_type
self.filepath = filepath self.filepath = filepath
self.file_object = file_object self.file_object = file_object
def read_file(
    self,
    mode: str = 'rb',
    encoding: Union[str, None] = None,
) -> Union[str, bytes, Any]:
    """Return a file-like handle for the data behind this input.

    Args:
        mode: Mode string forwarded to ``open`` when the input is backed by
            a path. Not applied when the input wraps an existing object.
        encoding: Text encoding forwarded to ``open`` when the input is
            backed by a path. Not applied when the input wraps an existing
            object.

    Returns:
        ``self.file_object`` unchanged when ``self.input_type`` is
        ``"object"``; otherwise a freshly opened handle on
        ``self.filepath``.

    Note:
        A handle opened here is not closed by this method, so the caller
        is responsible for closing it.
        NOTE(review): the return annotation lists ``str``/``bytes``, but
        what is returned is a handle (or whatever ``file_object`` holds),
        not the file's contents.
    """
    # Path-backed input: hand back a new handle opened as requested.
    if self.input_type != "object":
        return open(self.filepath, mode=mode, encoding=encoding)

    # Object-backed input: the stored object is returned as-is, so the
    # requested mode and encoding have no effect on it.
    return self.file_object

View File

@@ -11,6 +11,7 @@ from azure.ai.documentintelligence.models import (
from azure.identity import DefaultAzureCredential from azure.identity import DefaultAzureCredential
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum. # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
@@ -39,7 +40,7 @@ class DocumentIntelligenceConverter(DocumentConverter):
) )
def convert( def convert(
self, local_path: str, **kwargs: Any self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
# Bail if extension is not supported by Document Intelligence # Bail if extension is not supported by Document Intelligence
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
@@ -59,9 +60,9 @@ class DocumentIntelligenceConverter(DocumentConverter):
if extension.lower() not in docintel_extensions: if extension.lower() not in docintel_extensions:
return None return None
# Get the bytestring for the local path # Get the bytestring from the converter input
with open(local_path, "rb") as f: file_obj = input.read_file(mode='rb')
file_bytes = f.read() file_bytes = file_obj.read()
# Certain document analysis features are not available for office filetypes (.xlsx, .pptx, .html, .docx) # Certain document analysis features are not available for office filetypes (.xlsx, .pptx, .html, .docx)
if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]: if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]:

View File

@@ -8,6 +8,7 @@ from ._base import (
from ._base import DocumentConverter from ._base import DocumentConverter
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from ._converter_input import ConverterInput
class DocxConverter(HtmlConverter): class DocxConverter(HtmlConverter):
@@ -20,18 +21,17 @@ class DocxConverter(HtmlConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a DOCX # Bail if not a DOCX
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() != ".docx": if extension.lower() != ".docx":
return None return None
result = None result = None
with open(local_path, "rb") as docx_file: style_map = kwargs.get("style_map", None)
style_map = kwargs.get("style_map", None) file_obj = input.read_file(mode="rb")
result = mammoth.convert_to_html(file_obj, style_map=style_map)
result = mammoth.convert_to_html(docx_file, style_map=style_map) html_content = result.value
html_content = result.value result = self._convert(html_content)
result = self._convert(html_content)
return result return result

View File

@@ -3,6 +3,7 @@ from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify
from ._converter_input import ConverterInput
class HtmlConverter(DocumentConverter): class HtmlConverter(DocumentConverter):
@@ -14,7 +15,7 @@ class HtmlConverter(DocumentConverter):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert( def convert(
self, local_path: str, **kwargs: Any self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
# Bail if not html # Bail if not html
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
@@ -22,8 +23,8 @@ class HtmlConverter(DocumentConverter):
return None return None
result = None result = None
with open(local_path, "rt", encoding="utf-8") as fh: file_obj = input.read_file(mode="rt", encoding="utf-8")
result = self._convert(fh.read()) result = self._convert(file_obj.read())
return result return result

View File

@@ -1,6 +1,7 @@
from typing import Union from typing import Union
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._media_converter import MediaConverter from ._media_converter import MediaConverter
from ._converter_input import ConverterInput
class ImageConverter(MediaConverter): class ImageConverter(MediaConverter):
@@ -13,7 +14,7 @@ class ImageConverter(MediaConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not an image # Bail if not an image
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() not in [".jpg", ".jpeg", ".png"]: if extension.lower() not in [".jpg", ".jpeg", ".png"]:
@@ -21,8 +22,9 @@ class ImageConverter(MediaConverter):
md_content = "" md_content = ""
# Add metadata # Add metadata if a local path is provided
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) if input.input_type == "filepath":
metadata = self._get_metadata(input.filepath, kwargs.get("exiftool_path"))
if metadata: if metadata:
for f in [ for f in [
@@ -47,7 +49,7 @@ class ImageConverter(MediaConverter):
md_content += ( md_content += (
"\n# Description:\n" "\n# Description:\n"
+ self._get_llm_description( + self._get_llm_description(
local_path, input,
extension, extension,
llm_client, llm_client,
llm_model, llm_model,
@@ -61,17 +63,17 @@ class ImageConverter(MediaConverter):
text_content=md_content, text_content=md_content,
) )
def _get_llm_description(self, local_path, extension, client, model, prompt=None): def _get_llm_description(self, input: ConverterInput, extension, client, model, prompt=None):
if prompt is None or prompt.strip() == "": if prompt is None or prompt.strip() == "":
prompt = "Write a detailed caption for this image." prompt = "Write a detailed caption for this image."
data_uri = "" data_uri = ""
with open(local_path, "rb") as image_file: content_type, encoding = mimetypes.guess_type("_dummy" + extension)
content_type, encoding = mimetypes.guess_type("_dummy" + extension) if content_type is None:
if content_type is None: content_type = "image/jpeg"
content_type = "image/jpeg" image_file = input.read_file(mode="rb")
image_base64 = base64.b64encode(image_file.read()).decode("utf-8") image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
data_uri = f"data:{content_type};base64,{image_base64}" data_uri = f"data:{content_type};base64,{image_base64}"
messages = [ messages = [
{ {

View File

@@ -7,6 +7,7 @@ from ._base import (
) )
from .._exceptions import FileConversionException from .._exceptions import FileConversionException
from ._converter_input import ConverterInput
class IpynbConverter(DocumentConverter): class IpynbConverter(DocumentConverter):
@@ -18,7 +19,7 @@ class IpynbConverter(DocumentConverter):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert( def convert(
self, local_path: str, **kwargs: Any self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
# Bail if not ipynb # Bail if not ipynb
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
@@ -27,9 +28,9 @@ class IpynbConverter(DocumentConverter):
# Parse and convert the notebook # Parse and convert the notebook
result = None result = None
with open(local_path, "rt", encoding="utf-8") as fh: file_obj = input.read_file(mode="rt", encoding="utf-8")
notebook_content = json.load(fh) notebook_content = json.load(file_obj)
result = self._convert(notebook_content) result = self._convert(notebook_content)
return result return result

View File

@@ -1,7 +1,9 @@
import pdfminer import pdfminer
import pdfminer.high_level import pdfminer.high_level
from typing import Union from typing import Union
from io import StringIO
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
class PdfConverter(DocumentConverter): class PdfConverter(DocumentConverter):
@@ -14,13 +16,17 @@ class PdfConverter(DocumentConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a PDF # Bail if not a PDF
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() != ".pdf": if extension.lower() != ".pdf":
return None return None
output = StringIO()
file_obj = input.read_file(mode="rb")
pdfminer.high_level.extract_text_to_fp(file_obj, output)
return DocumentConverterResult( return DocumentConverterResult(
title=None, title=None,
text_content=pdfminer.high_level.extract_text(local_path), text_content=output.getvalue(),
) )