From 8e950325d2f6345b440affb8d146ed9e1265b6fc Mon Sep 17 00:00:00 2001
From: Kenny Zhang <kzhang678@gmail.com>
Date: Wed, 19 Feb 2025 14:01:43 -0500
Subject: [PATCH] Refactor remaining converters to accept ConverterInput

---
 .../markitdown/converters/_converter_input.py    |  2 +-
 .../converters/_wikipedia_converter.py           |  7 ++++---
 .../src/markitdown/converters/_xlsx_converter.py | 11 +++++++----
 .../markitdown/converters/_youtube_converter.py  |  8 +++++---
 .../src/markitdown/converters/_zip_converter.py  | 16 +++++++++++++---
 5 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_converter_input.py b/packages/markitdown/src/markitdown/converters/_converter_input.py
index ef55b36..e1a1024 100644
--- a/packages/markitdown/src/markitdown/converters/_converter_input.py
+++ b/packages/markitdown/src/markitdown/converters/_converter_input.py
@@ -21,7 +21,7 @@ class ConverterInput:
         self,
         mode: str = 'rb',
         encoding: Union[str, None] = None,
-    ) -> Union[str, bytes, Any]:
+    ) -> Any:
         if self.input_type == "object":
             return self.file_object
         
diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
index f27fe23..e3b98ca 100644
--- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
 
 from ._base import DocumentConverter, DocumentConverterResult
 from ._markdownify import _CustomMarkdownify
+from ._converter_input import ConverterInput
 
 
 class WikipediaConverter(DocumentConverter):
@@ -16,7 +17,7 @@ class WikipediaConverter(DocumentConverter):
         super().__init__(priority=priority)
 
     def convert(
-        self, local_path: str, **kwargs: Any
+        self, input: ConverterInput, **kwargs: Any
     ) -> Union[None, DocumentConverterResult]:
         # Bail if not Wikipedia
         extension = kwargs.get("file_extension", "")
@@ -28,8 +29,8 @@ class WikipediaConverter(DocumentConverter):
 
         # Parse the file
         soup = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            soup = BeautifulSoup(fh.read(), "html.parser")
+        file_obj = input.read_file(mode="rt", encoding="utf-8")
+        soup = BeautifulSoup(file_obj.read(), "html.parser")
 
         # Remove javascript and style blocks
         for script in soup(["script", "style"]):
diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
index 2bdfd5d..df80a47 100644
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -4,6 +4,7 @@ import pandas as pd
 
 from ._base import DocumentConverter, DocumentConverterResult
 from ._html_converter import HtmlConverter
+from ._converter_input import ConverterInput
 
 
 class XlsxConverter(HtmlConverter):
@@ -16,13 +17,14 @@ class XlsxConverter(HtmlConverter):
     ):
         super().__init__(priority=priority)
 
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
         # Bail if not a XLSX
         extension = kwargs.get("file_extension", "")
         if extension.lower() != ".xlsx":
             return None
 
-        sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
+        file_obj = input.read_file(mode="rb")
+        sheets = pd.read_excel(file_obj, sheet_name=None, engine="openpyxl")
         md_content = ""
         for s in sheets:
             md_content += f"## {s}\n"
@@ -40,13 +42,14 @@ class XlsConverter(HtmlConverter):
     Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
     """
 
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
         # Bail if not a XLS
         extension = kwargs.get("file_extension", "")
         if extension.lower() != ".xls":
             return None
 
-        sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
+        file_obj = input.read_file(mode="rb")
+        sheets = pd.read_excel(file_obj, sheet_name=None, engine="xlrd")
         md_content = ""
         for s in sheets:
             md_content += f"## {s}\n"
diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
index b961b88..9bcc2b0 100644
--- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@@ -1,10 +1,12 @@
 import re
+import json
 
 from typing import Any, Union, Dict, List
 from urllib.parse import parse_qs, urlparse
 from bs4 import BeautifulSoup
 
 from ._base import DocumentConverter, DocumentConverterResult
+from ._converter_input import ConverterInput
 
 
 # Optional YouTube transcription support
@@ -25,7 +27,7 @@ class YouTubeConverter(DocumentConverter):
         super().__init__(priority=priority)
 
     def convert(
-        self, local_path: str, **kwargs: Any
+        self, input: ConverterInput, **kwargs: Any
     ) -> Union[None, DocumentConverterResult]:
         # Bail if not YouTube
         extension = kwargs.get("file_extension", "")
@@ -37,8 +39,8 @@ class YouTubeConverter(DocumentConverter):
 
         # Parse the file
         soup = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            soup = BeautifulSoup(fh.read(), "html.parser")
+        file_obj = input.read_file(mode="rt", encoding="utf-8")
+        soup = BeautifulSoup(file_obj.read(), "html.parser")
 
         # Read the meta tags
         assert soup.title is not None and soup.title.string is not None
diff --git a/packages/markitdown/src/markitdown/converters/_zip_converter.py b/packages/markitdown/src/markitdown/converters/_zip_converter.py
index 026900d..c302b73 100644
--- a/packages/markitdown/src/markitdown/converters/_zip_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py
@@ -4,6 +4,7 @@ import shutil
 from typing import Any, Union
 
 from ._base import DocumentConverter, DocumentConverterResult
+from ._converter_input import ConverterInput
 
 
 class ZipConverter(DocumentConverter):
@@ -51,12 +52,17 @@ class ZipConverter(DocumentConverter):
         super().__init__(priority=priority)
 
     def convert(
-        self, local_path: str, **kwargs: Any
+        self, input: ConverterInput, **kwargs: Any
     ) -> Union[None, DocumentConverterResult]:
         # Bail if not a ZIP
         extension = kwargs.get("file_extension", "")
         if extension.lower() != ".zip":
             return None
+        
+        # Bail if a local path is not provided
+        if input.input_type != "filepath":
+            return None
+        local_path = input.filepath
 
         # Get parent converters list if available
         parent_converters = kwargs.get("_parent_converters", [])
@@ -110,8 +116,12 @@ class ZipConverter(DocumentConverter):
                         # Skip the zip converter to avoid infinite recursion
                         if isinstance(converter, ZipConverter):
                             continue
-
-                        result = converter.convert(file_path, **file_kwargs)
+                        
+                        # Create a ConverterInput for the parent converter and attempt conversion
+                        input = ConverterInput(
+                            input_type="filepath", filepath=file_path
+                        )
+                        result = converter.convert(input, **file_kwargs)
                         if result is not None:
                             md_content += f"\n## File: {relative_path}\n\n"
                             md_content += result.text_content + "\n\n"