From 52cbff061a0ba0d6343cc722e99f6fe22389dbb6 Mon Sep 17 00:00:00 2001
From: Kenny Zhang <kzhang678@gmail.com>
Date: Wed, 19 Feb 2025 11:48:00 -0500
Subject: [PATCH] begin refactoring converter classes

---
 .../markitdown/src/markitdown/__init__.py     |  2 --
 .../src/markitdown/converters/__init__.py     |  2 ++
 .../converters/_bing_serp_converter.py        |  7 +++---
 .../_converter_input.py}                      | 12 +++++++++-
 .../converters/_doc_intel_converter.py        |  9 +++----
 .../markitdown/converters/_docx_converter.py  | 14 +++++------
 .../markitdown/converters/_html_converter.py  |  7 +++---
 .../markitdown/converters/_image_converter.py | 24 ++++++++++---------
 .../markitdown/converters/_ipynb_converter.py |  9 +++----
 .../markitdown/converters/_pdf_converter.py   | 10 ++++++--
 10 files changed, 59 insertions(+), 37 deletions(-)
 rename packages/markitdown/src/markitdown/{_input.py => converters/_converter_input.py} (60%)

diff --git a/packages/markitdown/src/markitdown/__init__.py b/packages/markitdown/src/markitdown/__init__.py
index 53a4e5e..59d9750 100644
--- a/packages/markitdown/src/markitdown/__init__.py
+++ b/packages/markitdown/src/markitdown/__init__.py
@@ -10,7 +10,6 @@ from ._exceptions import (
     FileConversionException,
     UnsupportedFormatException,
 )
-from ._input import ConverterInput
 from .converters import DocumentConverter, DocumentConverterResult
 
 __all__ = [
@@ -22,5 +21,4 @@ __all__ = [
     "ConverterPrerequisiteException",
     "FileConversionException",
     "UnsupportedFormatException",
-    "ConverterInput",
 ]
diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py
index 1e5afe4..9ada5f1 100644
--- a/packages/markitdown/src/markitdown/converters/__init__.py
+++ b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -20,6 +20,7 @@ from ._mp3_converter import Mp3Converter
 from ._outlook_msg_converter import OutlookMsgConverter
 from ._zip_converter import ZipConverter
 from ._doc_intel_converter import DocumentIntelligenceConverter
+from ._converter_input import ConverterInput
 
 __all__ = [
     "DocumentConverter",
@@ -42,4 +43,5 @@ __all__ = [
     "OutlookMsgConverter",
     "ZipConverter",
     "DocumentIntelligenceConverter",
+    "ConverterInput",
 ]
diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
index d1b11a6..892f7e4 100644
--- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
@@ -8,6 +8,7 @@ from bs4 import BeautifulSoup
 
 from ._base import DocumentConverter, DocumentConverterResult
 from ._markdownify import _CustomMarkdownify
+from ._converter_input import ConverterInput
 
 
 class BingSerpConverter(DocumentConverter):
@@ -21,7 +22,7 @@ class BingSerpConverter(DocumentConverter):
     ):
         super().__init__(priority=priority)
 
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
         # Bail if not a Bing SERP
         extension = kwargs.get("file_extension", "")
         if extension.lower() not in [".html", ".htm"]:
@@ -36,8 +37,8 @@ class BingSerpConverter(DocumentConverter):
 
         # Parse the file
         soup = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            soup = BeautifulSoup(fh.read(), "html.parser")
+        file_obj = input.read_file(mode="rt", encoding="utf-8")
+        soup = BeautifulSoup(file_obj.read(), "html.parser")
 
         # Clean up some formatting
         for tptt in soup.find_all(class_="tptt"):
diff --git a/packages/markitdown/src/markitdown/_input.py b/packages/markitdown/src/markitdown/converters/_converter_input.py
similarity index 60%
rename from packages/markitdown/src/markitdown/_input.py
rename to packages/markitdown/src/markitdown/converters/_converter_input.py
index 858f3b1..ef55b36 100644
--- a/packages/markitdown/src/markitdown/_input.py
+++ b/packages/markitdown/src/markitdown/converters/_converter_input.py
@@ -15,4 +15,14 @@ class ConverterInput:
         
         self.input_type = input_type
         self.filepath = filepath
-        self.file_object = file_object
\ No newline at end of file
+        self.file_object = file_object
+
+    def read_file(self, mode: str = "rb", encoding: Union[str, None] = None) -> Any:
+        """Return a readable file object for this input.
+
+        "object" inputs return the stored object as-is, ignoring ``mode`` and
+        ``encoding``; otherwise ``filepath`` is opened (caller must close it).
+        """
+        if self.input_type == "object":
+            return self.file_object
+        return open(self.filepath, mode=mode, encoding=encoding)
diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
index a1eac06..fd30a74 100644
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -11,6 +11,7 @@ from azure.ai.documentintelligence.models import (
 from azure.identity import DefaultAzureCredential
 
 from ._base import DocumentConverter, DocumentConverterResult
+from ._converter_input import ConverterInput
 
 
 # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
@@ -39,7 +40,7 @@ class DocumentIntelligenceConverter(DocumentConverter):
         )
 
     def convert(
-        self, local_path: str, **kwargs: Any
+        self, input: ConverterInput, **kwargs: Any
     ) -> Union[None, DocumentConverterResult]:
         # Bail if extension is not supported by Document Intelligence
         extension = kwargs.get("file_extension", "")
@@ -59,9 +60,9 @@ class DocumentIntelligenceConverter(DocumentConverter):
         if extension.lower() not in docintel_extensions:
             return None
 
-        # Get the bytestring for the local path
-        with open(local_path, "rb") as f:
-            file_bytes = f.read()
+        # Get the bytestring from the converter input
+        file_obj = input.read_file(mode='rb')
+        file_bytes = file_obj.read()
 
         # Certain document analysis features are not availiable for office filetypes (.xlsx, .pptx, .html, .docx)
         if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]:
diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
index 8515f6d..c8f7c10 100644
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -8,6 +8,7 @@ from ._base import (
 
 from ._base import DocumentConverter
 from ._html_converter import HtmlConverter
+from ._converter_input import ConverterInput
 
 
 class DocxConverter(HtmlConverter):
@@ -20,18 +21,17 @@ class DocxConverter(HtmlConverter):
     ):
         super().__init__(priority=priority)
 
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
         # Bail if not a DOCX
         extension = kwargs.get("file_extension", "")
         if extension.lower() != ".docx":
             return None
 
         result = None
-        with open(local_path, "rb") as docx_file:
-            style_map = kwargs.get("style_map", None)
-
-            result = mammoth.convert_to_html(docx_file, style_map=style_map)
-            html_content = result.value
-            result = self._convert(html_content)
+        style_map = kwargs.get("style_map", None)
+        file_obj = input.read_file(mode="rb")
+        result = mammoth.convert_to_html(file_obj, style_map=style_map)
+        html_content = result.value
+        result = self._convert(html_content)
 
         return result
diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py
index 68c2536..8ac882d 100644
--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@@ -3,6 +3,7 @@ from bs4 import BeautifulSoup
 
 from ._base import DocumentConverter, DocumentConverterResult
 from ._markdownify import _CustomMarkdownify
+from ._converter_input import ConverterInput
 
 
 class HtmlConverter(DocumentConverter):
@@ -14,7 +15,7 @@ class HtmlConverter(DocumentConverter):
         super().__init__(priority=priority)
 
     def convert(
-        self, local_path: str, **kwargs: Any
+        self, input: ConverterInput, **kwargs: Any
     ) -> Union[None, DocumentConverterResult]:
         # Bail if not html
         extension = kwargs.get("file_extension", "")
@@ -22,8 +23,8 @@ class HtmlConverter(DocumentConverter):
             return None
 
         result = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            result = self._convert(fh.read())
+        file_obj = input.read_file(mode="rt", encoding="utf-8")
+        result = self._convert(file_obj.read())
 
         return result
 
diff --git a/packages/markitdown/src/markitdown/converters/_image_converter.py b/packages/markitdown/src/markitdown/converters/_image_converter.py
index a46b67c..1c1056a 100644
--- a/packages/markitdown/src/markitdown/converters/_image_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_image_converter.py
@@ -1,6 +1,7 @@
 from typing import Union
 from ._base import DocumentConverter, DocumentConverterResult
 from ._media_converter import MediaConverter
+from ._converter_input import ConverterInput
 
 
 class ImageConverter(MediaConverter):
@@ -13,7 +14,7 @@ class ImageConverter(MediaConverter):
     ):
         super().__init__(priority=priority)
 
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
         # Bail if not an image
         extension = kwargs.get("file_extension", "")
         if extension.lower() not in [".jpg", ".jpeg", ".png"]:
@@ -21,8 +22,9 @@ class ImageConverter(MediaConverter):
 
         md_content = ""
 
-        # Add metadata
-        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
+        metadata = None  # Only extracted when a local path is provided
+        if input.input_type == "filepath":
+            metadata = self._get_metadata(input.filepath, kwargs.get("exiftool_path"))
 
         if metadata:
             for f in [
@@ -47,7 +49,7 @@ class ImageConverter(MediaConverter):
             md_content += (
                 "\n# Description:\n"
                 + self._get_llm_description(
-                    local_path,
+                    input,
                     extension,
                     llm_client,
                     llm_model,
@@ -61,17 +63,17 @@ class ImageConverter(MediaConverter):
             text_content=md_content,
         )
 
-    def _get_llm_description(self, local_path, extension, client, model, prompt=None):
+    def _get_llm_description(self, input: ConverterInput, extension, client, model, prompt=None):
         if prompt is None or prompt.strip() == "":
             prompt = "Write a detailed caption for this image."
 
         data_uri = ""
-        with open(local_path, "rb") as image_file:
-            content_type, encoding = mimetypes.guess_type("_dummy" + extension)
-            if content_type is None:
-                content_type = "image/jpeg"
-            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
-            data_uri = f"data:{content_type};base64,{image_base64}"
+        content_type, encoding = mimetypes.guess_type("_dummy" + extension)
+        if content_type is None:
+            content_type = "image/jpeg"
+        image_file = input.read_file(mode="rb")
+        image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
+        data_uri = f"data:{content_type};base64,{image_base64}"
 
         messages = [
             {
diff --git a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
index b487f41..aa3a887 100644
--- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
@@ -7,6 +7,7 @@ from ._base import (
 )
 
 from .._exceptions import FileConversionException
+from ._converter_input import ConverterInput
 
 
 class IpynbConverter(DocumentConverter):
@@ -18,7 +19,7 @@ class IpynbConverter(DocumentConverter):
         super().__init__(priority=priority)
 
     def convert(
-        self, local_path: str, **kwargs: Any
+        self, input: ConverterInput, **kwargs: Any
     ) -> Union[None, DocumentConverterResult]:
         # Bail if not ipynb
         extension = kwargs.get("file_extension", "")
@@ -27,9 +28,9 @@ class IpynbConverter(DocumentConverter):
 
         # Parse and convert the notebook
         result = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            notebook_content = json.load(fh)
-            result = self._convert(notebook_content)
+        file_obj = input.read_file(mode="rt", encoding="utf-8")
+        notebook_content = json.load(file_obj)
+        result = self._convert(notebook_content)
 
         return result
 
diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
index 3a2b671..870d6bf 100644
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -1,7 +1,9 @@
 import pdfminer
 import pdfminer.high_level
 from typing import Union
+from io import StringIO
 from ._base import DocumentConverter, DocumentConverterResult
+from ._converter_input import ConverterInput
 
 
 class PdfConverter(DocumentConverter):
@@ -14,13 +16,17 @@ class PdfConverter(DocumentConverter):
     ):
         super().__init__(priority=priority)
 
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
         # Bail if not a PDF
         extension = kwargs.get("file_extension", "")
         if extension.lower() != ".pdf":
             return None
 
+        # extract_text() takes a path or binary file object and, unlike
+        # extract_text_to_fp(), defaults laparams to LAParams() for layout analysis
+        file_obj = input.read_file(mode="rb")
         return DocumentConverterResult(
             title=None,
-            text_content=pdfminer.high_level.extract_text(local_path),
+            text_content=pdfminer.high_level.extract_text(file_obj),
         )
+