refactored remaining converters

2025-02-19 14:01:43 -05:00
parent 096fef3d5f
commit 8e950325d2
5 changed files with 30 additions and 14 deletions
--- a/packages/markitdown/src/markitdown/converters/_converter_input.py
+++ b/packages/markitdown/src/markitdown/converters/_converter_input.py
@@ -21,7 +21,7 @@ class ConverterInput:
        self,
        mode: str = 'rb',
        encoding: Union[str, None] = None,
-    ) -> Union[str, bytes, Any]:
+    ) -> Any:
        if self.input_type == "object":
            return self.file_object
--- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
 from ._base import DocumentConverter, DocumentConverterResult
 from ._markdownify import _CustomMarkdownify
 from ._converter_input import ConverterInput
 class WikipediaConverter(DocumentConverter):
@@ -16,7 +17,7 @@ class WikipediaConverter(DocumentConverter):
        super().__init__(priority=priority)
    def convert(
-        self, local_path: str, **kwargs: Any
+        self, input: ConverterInput, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not Wikipedia
        extension = kwargs.get("file_extension", "")
@@ -28,8 +29,8 @@ class WikipediaConverter(DocumentConverter):
        # Parse the file
        soup = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
+        file_obj = input.read_file(mode="rt", encoding="utf-8")
-            soup = BeautifulSoup(fh.read(), "html.parser")
+        soup = BeautifulSoup(file_obj.read(), "html.parser")
        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -4,6 +4,7 @@ import pandas as pd
 from ._base import DocumentConverter, DocumentConverterResult
 from ._html_converter import HtmlConverter
 from ._converter_input import ConverterInput
 class XlsxConverter(HtmlConverter):
@@ -16,13 +17,14 @@ class XlsxConverter(HtmlConverter):
    ):
        super().__init__(priority=priority)
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a XLSX
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".xlsx":
            return None
-        sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
+        file_obj = input.read_file(mode="rb")
        sheets = pd.read_excel(file_obj, sheet_name=None, engine="openpyxl")
        md_content = ""
        for s in sheets:
            md_content += f"## {s}\n"
@@ -40,13 +42,14 @@ class XlsConverter(HtmlConverter):
    Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
    """
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a XLS
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".xls":
            return None
-        sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
+        file_obj = input.read_file(mode="rb")
        sheets = pd.read_excel(file_obj, sheet_name=None, engine="xlrd")
        md_content = ""
        for s in sheets:
            md_content += f"## {s}\n"
--- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@@ -1,10 +1,12 @@
 import re
 import json
 from typing import Any, Union, Dict, List
 from urllib.parse import parse_qs, urlparse
 from bs4 import BeautifulSoup
 from ._base import DocumentConverter, DocumentConverterResult
 from ._converter_input import ConverterInput
 # Optional YouTube transcription support
@@ -25,7 +27,7 @@ class YouTubeConverter(DocumentConverter):
        super().__init__(priority=priority)
    def convert(
-        self, local_path: str, **kwargs: Any
+        self, input: ConverterInput, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not YouTube
        extension = kwargs.get("file_extension", "")
@@ -37,8 +39,8 @@ class YouTubeConverter(DocumentConverter):
        # Parse the file
        soup = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
+        file_obj = input.read_file(mode="rt", encoding="utf-8")
-            soup = BeautifulSoup(fh.read(), "html.parser")
+        soup = BeautifulSoup(file_obj.read(), "html.parser")
        # Read the meta tags
        assert soup.title is not None and soup.title.string is not None
--- a/packages/markitdown/src/markitdown/converters/_zip_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py
@@ -4,6 +4,7 @@ import shutil
 from typing import Any, Union
 from ._base import DocumentConverter, DocumentConverterResult
 from ._converter_input import ConverterInput
 class ZipConverter(DocumentConverter):
@@ -51,12 +52,17 @@ class ZipConverter(DocumentConverter):
        super().__init__(priority=priority)
    def convert(
-        self, local_path: str, **kwargs: Any
+        self, input: ConverterInput, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not a ZIP
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".zip":
            return None
        # Bail if a local path is not provided
        if input.input_type != "filepath":
            return None
        local_path = input.filepath
        # Get parent converters list if available
        parent_converters = kwargs.get("_parent_converters", [])
@@ -110,8 +116,12 @@ class ZipConverter(DocumentConverter):
                        # Skip the zip converter to avoid infinite recursion
                        if isinstance(converter, ZipConverter):
                            continue
-
+                        
-                        result = converter.convert(file_path, **file_kwargs)
+                        # Create a ConverterInput for the parent converter and attempt conversion
                        input = ConverterInput(
                            input_type="filepath", filepath=file_path
                        )
                        result = converter.convert(input, **file_kwargs)
                        if result is not None:
                            md_content += f"\n## File: {relative_path}\n\n"
                            md_content += result.text_content + "\n\n"