From 52cbff061a0ba0d6343cc722e99f6fe22389dbb6 Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Wed, 19 Feb 2025 11:48:00 -0500 Subject: [PATCH] begin refactoring converter classes --- .../markitdown/src/markitdown/__init__.py | 2 -- .../src/markitdown/converters/__init__.py | 2 ++ .../converters/_bing_serp_converter.py | 7 +++--- .../_converter_input.py} | 12 +++++++++- .../converters/_doc_intel_converter.py | 9 +++---- .../markitdown/converters/_docx_converter.py | 14 +++++------ .../markitdown/converters/_html_converter.py | 7 +++--- .../markitdown/converters/_image_converter.py | 24 ++++++++++--------- .../markitdown/converters/_ipynb_converter.py | 9 +++---- .../markitdown/converters/_pdf_converter.py | 10 ++++++-- 10 files changed, 59 insertions(+), 37 deletions(-) rename packages/markitdown/src/markitdown/{_input.py => converters/_converter_input.py} (60%) diff --git a/packages/markitdown/src/markitdown/__init__.py b/packages/markitdown/src/markitdown/__init__.py index 53a4e5e..59d9750 100644 --- a/packages/markitdown/src/markitdown/__init__.py +++ b/packages/markitdown/src/markitdown/__init__.py @@ -10,7 +10,6 @@ from ._exceptions import ( FileConversionException, UnsupportedFormatException, ) -from ._input import ConverterInput from .converters import DocumentConverter, DocumentConverterResult __all__ = [ @@ -22,5 +21,4 @@ __all__ = [ "ConverterPrerequisiteException", "FileConversionException", "UnsupportedFormatException", - "ConverterInput", ] diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py index 1e5afe4..9ada5f1 100644 --- a/packages/markitdown/src/markitdown/converters/__init__.py +++ b/packages/markitdown/src/markitdown/converters/__init__.py @@ -20,6 +20,7 @@ from ._mp3_converter import Mp3Converter from ._outlook_msg_converter import OutlookMsgConverter from ._zip_converter import ZipConverter from ._doc_intel_converter import DocumentIntelligenceConverter +from ._converter_input import ConverterInput __all__ = [ "DocumentConverter", @@ -42,4 +43,5 @@ __all__ = [ "OutlookMsgConverter", "ZipConverter", "DocumentIntelligenceConverter", + "ConverterInput", ] diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py index d1b11a6..892f7e4 100644 --- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py +++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py @@ -8,6 +8,7 @@ from bs4 import BeautifulSoup from ._base import DocumentConverter, DocumentConverterResult from ._markdownify import _CustomMarkdownify +from ._converter_input import ConverterInput class BingSerpConverter(DocumentConverter): @@ -21,7 +22,7 @@ class BingSerpConverter(DocumentConverter): ): super().__init__(priority=priority) - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a Bing SERP extension = kwargs.get("file_extension", "") if extension.lower() not in [".html", ".htm"]: @@ -36,8 +37,8 @@ class BingSerpConverter(DocumentConverter): # Parse the file soup = None - with open(local_path, "rt", encoding="utf-8") as fh: - soup = BeautifulSoup(fh.read(), "html.parser") + file_obj = input.read_file(mode="rt", encoding="utf-8") + soup = BeautifulSoup(file_obj.read(), "html.parser") # Clean up some formatting for tptt in soup.find_all(class_="tptt"): diff --git a/packages/markitdown/src/markitdown/_input.py b/packages/markitdown/src/markitdown/converters/_converter_input.py similarity index 60% rename from packages/markitdown/src/markitdown/_input.py rename to packages/markitdown/src/markitdown/converters/_converter_input.py index 858f3b1..ef55b36 100644 --- a/packages/markitdown/src/markitdown/_input.py +++ b/packages/markitdown/src/markitdown/converters/_converter_input.py @@ -15,4 +15,14 @@ class ConverterInput: self.input_type = input_type self.filepath = filepath - self.file_object = file_object \ No newline at end of file + self.file_object = file_object + + def read_file( + self, + mode: str = 'rb', + encoding: Union[str, None] = None, + ) -> Union[str, bytes, Any]: + if self.input_type == "object": + return self.file_object + + return open(self.filepath, mode=mode, encoding=encoding) \ No newline at end of file diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index a1eac06..fd30a74 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -11,6 +11,7 @@ from azure.ai.documentintelligence.models import ( from azure.identity import DefaultAzureCredential from ._base import DocumentConverter, DocumentConverterResult +from ._converter_input import ConverterInput # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum. @@ -39,7 +40,7 @@ class DocumentIntelligenceConverter(DocumentConverter): ) def convert( - self, local_path: str, **kwargs: Any + self, input: ConverterInput, **kwargs: Any ) -> Union[None, DocumentConverterResult]: # Bail if extension is not supported by Document Intelligence extension = kwargs.get("file_extension", "") @@ -59,9 +60,9 @@ class DocumentIntelligenceConverter(DocumentConverter): if extension.lower() not in docintel_extensions: return None - # Get the bytestring for the local path - with open(local_path, "rb") as f: - file_bytes = f.read() + # Get the bytestring from the converter input + file_obj = input.read_file(mode='rb') + file_bytes = file_obj.read() # Certain document analysis features are not availiable for office filetypes (.xlsx, .pptx, .html, .docx) if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]: diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index 8515f6d..c8f7c10 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -8,6 +8,7 @@ from ._base import ( from ._base import DocumentConverter from ._html_converter import HtmlConverter +from ._converter_input import ConverterInput class DocxConverter(HtmlConverter): @@ -20,18 +21,17 @@ class DocxConverter(HtmlConverter): ): super().__init__(priority=priority) - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a DOCX extension = kwargs.get("file_extension", "") if extension.lower() != ".docx": return None result = None - with open(local_path, "rb") as docx_file: - style_map = kwargs.get("style_map", None) - - result = mammoth.convert_to_html(docx_file, style_map=style_map) - html_content = result.value - result = self._convert(html_content) + style_map = kwargs.get("style_map", None) + file_obj = input.read_file(mode="rb") + result = mammoth.convert_to_html(file_obj, style_map=style_map) + html_content = result.value + result = self._convert(html_content) return result diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py index 68c2536..8ac882d 100644 --- a/packages/markitdown/src/markitdown/converters/_html_converter.py +++ b/packages/markitdown/src/markitdown/converters/_html_converter.py @@ -3,6 +3,7 @@ from bs4 import BeautifulSoup from ._base import DocumentConverter, DocumentConverterResult from ._markdownify import _CustomMarkdownify +from ._converter_input import ConverterInput class HtmlConverter(DocumentConverter): @@ -14,7 +15,7 @@ class HtmlConverter(DocumentConverter): super().__init__(priority=priority) def convert( - self, local_path: str, **kwargs: Any + self, input: ConverterInput, **kwargs: Any ) -> Union[None, DocumentConverterResult]: # Bail if not html extension = kwargs.get("file_extension", "") @@ -22,8 +23,8 @@ class HtmlConverter(DocumentConverter): return None result = None - with open(local_path, "rt", encoding="utf-8") as fh: - result = self._convert(fh.read()) + file_obj = input.read_file(mode="rt", encoding="utf-8") + result = self._convert(file_obj.read()) return result diff --git a/packages/markitdown/src/markitdown/converters/_image_converter.py b/packages/markitdown/src/markitdown/converters/_image_converter.py index a46b67c..1c1056a 100644 --- a/packages/markitdown/src/markitdown/converters/_image_converter.py +++ b/packages/markitdown/src/markitdown/converters/_image_converter.py @@ -1,6 +1,7 @@ from typing import Union from ._base import DocumentConverter, DocumentConverterResult from ._media_converter import MediaConverter +from ._converter_input import ConverterInput class ImageConverter(MediaConverter): @@ -13,7 +14,7 @@ class ImageConverter(MediaConverter): ): super().__init__(priority=priority) - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not an image extension = kwargs.get("file_extension", "") if extension.lower() not in [".jpg", ".jpeg", ".png"]: @@ -21,8 +22,9 @@ class ImageConverter(MediaConverter): md_content = "" - # Add metadata - metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) + # Add metadata if a local path is provided + if input.input_type == "filepath": + metadata = self._get_metadata(input.filepath, kwargs.get("exiftool_path")) if metadata: for f in [ @@ -47,7 +49,7 @@ class ImageConverter(MediaConverter): md_content += ( "\n# Description:\n" + self._get_llm_description( - local_path, + input, extension, llm_client, llm_model, @@ -61,17 +63,17 @@ class ImageConverter(MediaConverter): text_content=md_content, ) - def _get_llm_description(self, local_path, extension, client, model, prompt=None): + def _get_llm_description(self, input: ConverterInput, extension, client, model, prompt=None): if prompt is None or prompt.strip() == "": prompt = "Write a detailed caption for this image." data_uri = "" - with open(local_path, "rb") as image_file: - content_type, encoding = mimetypes.guess_type("_dummy" + extension) - if content_type is None: - content_type = "image/jpeg" - image_base64 = base64.b64encode(image_file.read()).decode("utf-8") - data_uri = f"data:{content_type};base64,{image_base64}" + content_type, encoding = mimetypes.guess_type("_dummy" + extension) + if content_type is None: + content_type = "image/jpeg" + image_file = input.read_file(mode="rb") + image_base64 = base64.b64encode(image_file.read()).decode("utf-8") + data_uri = f"data:{content_type};base64,{image_base64}" messages = [ { diff --git a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py index b487f41..aa3a887 100644 --- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py +++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py @@ -7,6 +7,7 @@ from ._base import ( ) from .._exceptions import FileConversionException +from ._converter_input import ConverterInput class IpynbConverter(DocumentConverter): @@ -18,7 +19,7 @@ class IpynbConverter(DocumentConverter): super().__init__(priority=priority) def convert( - self, local_path: str, **kwargs: Any + self, input: ConverterInput, **kwargs: Any ) -> Union[None, DocumentConverterResult]: # Bail if not ipynb extension = kwargs.get("file_extension", "") @@ -27,9 +28,9 @@ class IpynbConverter(DocumentConverter): # Parse and convert the notebook result = None - with open(local_path, "rt", encoding="utf-8") as fh: - notebook_content = json.load(fh) - result = self._convert(notebook_content) + file_obj = input.read_file(mode="rt", encoding="utf-8") + notebook_content = json.load(file_obj) + result = self._convert(notebook_content) return result diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index 3a2b671..870d6bf 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -1,7 +1,9 @@ import pdfminer import pdfminer.high_level from typing import Union +from io import StringIO from ._base import DocumentConverter, DocumentConverterResult +from ._converter_input import ConverterInput class PdfConverter(DocumentConverter): @@ -14,13 +16,17 @@ class PdfConverter(DocumentConverter): ): super().__init__(priority=priority) - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a PDF extension = kwargs.get("file_extension", "") if extension.lower() != ".pdf": return None + output = StringIO() + file_obj = input.read_file(mode="rb") + pdfminer.high_level.extract_text_to_fp(file_obj, output) return DocumentConverterResult( title=None, - text_content=pdfminer.high_level.extract_text(local_path), + text_content=output.getvalue(), ) +