begin refactoring converter classes
This commit is contained in:
@@ -10,7 +10,6 @@ from ._exceptions import (
|
|||||||
FileConversionException,
|
FileConversionException,
|
||||||
UnsupportedFormatException,
|
UnsupportedFormatException,
|
||||||
)
|
)
|
||||||
from ._input import ConverterInput
|
|
||||||
from .converters import DocumentConverter, DocumentConverterResult
|
from .converters import DocumentConverter, DocumentConverterResult
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
@@ -22,5 +21,4 @@ __all__ = [
|
|||||||
"ConverterPrerequisiteException",
|
"ConverterPrerequisiteException",
|
||||||
"FileConversionException",
|
"FileConversionException",
|
||||||
"UnsupportedFormatException",
|
"UnsupportedFormatException",
|
||||||
"ConverterInput",
|
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ from ._mp3_converter import Mp3Converter
|
|||||||
from ._outlook_msg_converter import OutlookMsgConverter
|
from ._outlook_msg_converter import OutlookMsgConverter
|
||||||
from ._zip_converter import ZipConverter
|
from ._zip_converter import ZipConverter
|
||||||
from ._doc_intel_converter import DocumentIntelligenceConverter
|
from ._doc_intel_converter import DocumentIntelligenceConverter
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"DocumentConverter",
|
"DocumentConverter",
|
||||||
@@ -42,4 +43,5 @@ __all__ = [
|
|||||||
"OutlookMsgConverter",
|
"OutlookMsgConverter",
|
||||||
"ZipConverter",
|
"ZipConverter",
|
||||||
"DocumentIntelligenceConverter",
|
"DocumentIntelligenceConverter",
|
||||||
|
"ConverterInput",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ from bs4 import BeautifulSoup
|
|||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class BingSerpConverter(DocumentConverter):
|
class BingSerpConverter(DocumentConverter):
|
||||||
@@ -21,7 +22,7 @@ class BingSerpConverter(DocumentConverter):
|
|||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a Bing SERP
|
# Bail if not a Bing SERP
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() not in [".html", ".htm"]:
|
if extension.lower() not in [".html", ".htm"]:
|
||||||
@@ -36,8 +37,8 @@ class BingSerpConverter(DocumentConverter):
|
|||||||
|
|
||||||
# Parse the file
|
# Parse the file
|
||||||
soup = None
|
soup = None
|
||||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||||
soup = BeautifulSoup(fh.read(), "html.parser")
|
soup = BeautifulSoup(file_obj.read(), "html.parser")
|
||||||
|
|
||||||
# Clean up some formatting
|
# Clean up some formatting
|
||||||
for tptt in soup.find_all(class_="tptt"):
|
for tptt in soup.find_all(class_="tptt"):
|
||||||
|
|||||||
@@ -16,3 +16,13 @@ class ConverterInput:
|
|||||||
self.input_type = input_type
|
self.input_type = input_type
|
||||||
self.filepath = filepath
|
self.filepath = filepath
|
||||||
self.file_object = file_object
|
self.file_object = file_object
|
||||||
|
|
||||||
|
def read_file(
|
||||||
|
self,
|
||||||
|
mode: str = 'rb',
|
||||||
|
encoding: Union[str, None] = None,
|
||||||
|
) -> Union[str, bytes, Any]:
|
||||||
|
if self.input_type == "object":
|
||||||
|
return self.file_object
|
||||||
|
|
||||||
|
return open(self.filepath, mode=mode, encoding=encoding)
|
||||||
@@ -11,6 +11,7 @@ from azure.ai.documentintelligence.models import (
|
|||||||
from azure.identity import DefaultAzureCredential
|
from azure.identity import DefaultAzureCredential
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
|
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
|
||||||
@@ -39,7 +40,7 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, input: ConverterInput, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if extension is not supported by Document Intelligence
|
# Bail if extension is not supported by Document Intelligence
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
@@ -59,9 +60,9 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
|||||||
if extension.lower() not in docintel_extensions:
|
if extension.lower() not in docintel_extensions:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Get the bytestring for the local path
|
# Get the bytestring from the converter input
|
||||||
with open(local_path, "rb") as f:
|
file_obj = input.read_file(mode='rb')
|
||||||
file_bytes = f.read()
|
file_bytes = file_obj.read()
|
||||||
|
|
||||||
# Certain document analysis features are not available for office filetypes (.xlsx, .pptx, .html, .docx)
|
# Certain document analysis features are not available for office filetypes (.xlsx, .pptx, .html, .docx)
|
||||||
if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]:
|
if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]:
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ from ._base import (
|
|||||||
|
|
||||||
from ._base import DocumentConverter
|
from ._base import DocumentConverter
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class DocxConverter(HtmlConverter):
|
class DocxConverter(HtmlConverter):
|
||||||
@@ -20,18 +21,17 @@ class DocxConverter(HtmlConverter):
|
|||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a DOCX
|
# Bail if not a DOCX
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".docx":
|
if extension.lower() != ".docx":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
result = None
|
result = None
|
||||||
with open(local_path, "rb") as docx_file:
|
style_map = kwargs.get("style_map", None)
|
||||||
style_map = kwargs.get("style_map", None)
|
file_obj = input.read_file(mode="rb")
|
||||||
|
result = mammoth.convert_to_html(file_obj, style_map=style_map)
|
||||||
result = mammoth.convert_to_html(docx_file, style_map=style_map)
|
html_content = result.value
|
||||||
html_content = result.value
|
result = self._convert(html_content)
|
||||||
result = self._convert(html_content)
|
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ from bs4 import BeautifulSoup
|
|||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class HtmlConverter(DocumentConverter):
|
class HtmlConverter(DocumentConverter):
|
||||||
@@ -14,7 +15,7 @@ class HtmlConverter(DocumentConverter):
|
|||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, input: ConverterInput, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not html
|
# Bail if not html
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
@@ -22,8 +23,8 @@ class HtmlConverter(DocumentConverter):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
result = None
|
result = None
|
||||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||||
result = self._convert(fh.read())
|
result = self._convert(file_obj.read())
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
from typing import Union
|
from typing import Union
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
from ._media_converter import MediaConverter
|
from ._media_converter import MediaConverter
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class ImageConverter(MediaConverter):
|
class ImageConverter(MediaConverter):
|
||||||
@@ -13,7 +14,7 @@ class ImageConverter(MediaConverter):
|
|||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not an image
|
# Bail if not an image
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() not in [".jpg", ".jpeg", ".png"]:
|
if extension.lower() not in [".jpg", ".jpeg", ".png"]:
|
||||||
@@ -21,8 +22,9 @@ class ImageConverter(MediaConverter):
|
|||||||
|
|
||||||
md_content = ""
|
md_content = ""
|
||||||
|
|
||||||
# Add metadata
|
# Add metadata if a local path is provided
|
||||||
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
|
if input.input_type == "filepath":
|
||||||
|
metadata = self._get_metadata(input.filepath, kwargs.get("exiftool_path"))
|
||||||
|
|
||||||
if metadata:
|
if metadata:
|
||||||
for f in [
|
for f in [
|
||||||
@@ -47,7 +49,7 @@ class ImageConverter(MediaConverter):
|
|||||||
md_content += (
|
md_content += (
|
||||||
"\n# Description:\n"
|
"\n# Description:\n"
|
||||||
+ self._get_llm_description(
|
+ self._get_llm_description(
|
||||||
local_path,
|
input,
|
||||||
extension,
|
extension,
|
||||||
llm_client,
|
llm_client,
|
||||||
llm_model,
|
llm_model,
|
||||||
@@ -61,17 +63,17 @@ class ImageConverter(MediaConverter):
|
|||||||
text_content=md_content,
|
text_content=md_content,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_llm_description(self, local_path, extension, client, model, prompt=None):
|
def _get_llm_description(self, input: ConverterInput, extension, client, model, prompt=None):
|
||||||
if prompt is None or prompt.strip() == "":
|
if prompt is None or prompt.strip() == "":
|
||||||
prompt = "Write a detailed caption for this image."
|
prompt = "Write a detailed caption for this image."
|
||||||
|
|
||||||
data_uri = ""
|
data_uri = ""
|
||||||
with open(local_path, "rb") as image_file:
|
content_type, encoding = mimetypes.guess_type("_dummy" + extension)
|
||||||
content_type, encoding = mimetypes.guess_type("_dummy" + extension)
|
if content_type is None:
|
||||||
if content_type is None:
|
content_type = "image/jpeg"
|
||||||
content_type = "image/jpeg"
|
image_file = input.read_file(mode="rb")
|
||||||
image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
|
image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
|
||||||
data_uri = f"data:{content_type};base64,{image_base64}"
|
data_uri = f"data:{content_type};base64,{image_base64}"
|
||||||
|
|
||||||
messages = [
|
messages = [
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ from ._base import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
from .._exceptions import FileConversionException
|
from .._exceptions import FileConversionException
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class IpynbConverter(DocumentConverter):
|
class IpynbConverter(DocumentConverter):
|
||||||
@@ -18,7 +19,7 @@ class IpynbConverter(DocumentConverter):
|
|||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, input: ConverterInput, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not ipynb
|
# Bail if not ipynb
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
@@ -27,9 +28,9 @@ class IpynbConverter(DocumentConverter):
|
|||||||
|
|
||||||
# Parse and convert the notebook
|
# Parse and convert the notebook
|
||||||
result = None
|
result = None
|
||||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||||
notebook_content = json.load(fh)
|
notebook_content = json.load(file_obj)
|
||||||
result = self._convert(notebook_content)
|
result = self._convert(notebook_content)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,9 @@
|
|||||||
import pdfminer
|
import pdfminer
|
||||||
import pdfminer.high_level
|
import pdfminer.high_level
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
from io import StringIO
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class PdfConverter(DocumentConverter):
|
class PdfConverter(DocumentConverter):
|
||||||
@@ -14,13 +16,17 @@ class PdfConverter(DocumentConverter):
|
|||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a PDF
|
# Bail if not a PDF
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".pdf":
|
if extension.lower() != ".pdf":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
output = StringIO()
|
||||||
|
file_obj = input.read_file(mode="rb")
|
||||||
|
pdfminer.high_level.extract_text_to_fp(file_obj, output)
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=None,
|
title=None,
|
||||||
text_content=pdfminer.high_level.extract_text(local_path),
|
text_content=output.getvalue(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user