From 8e950325d2f6345b440affb8d146ed9e1265b6fc Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Wed, 19 Feb 2025 14:01:43 -0500 Subject: [PATCH] refactored remaining converters --- .../markitdown/converters/_converter_input.py | 2 +- .../converters/_wikipedia_converter.py | 7 ++++--- .../src/markitdown/converters/_xlsx_converter.py | 11 +++++++---- .../markitdown/converters/_youtube_converter.py | 8 +++++--- .../src/markitdown/converters/_zip_converter.py | 16 +++++++++++++--- 5 files changed, 30 insertions(+), 14 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_converter_input.py b/packages/markitdown/src/markitdown/converters/_converter_input.py index ef55b36..e1a1024 100644 --- a/packages/markitdown/src/markitdown/converters/_converter_input.py +++ b/packages/markitdown/src/markitdown/converters/_converter_input.py @@ -21,7 +21,7 @@ class ConverterInput: self, mode: str = 'rb', encoding: Union[str, None] = None, - ) -> Union[str, bytes, Any]: + ) -> Any: if self.input_type == "object": return self.file_object diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py index f27fe23..e3b98ca 100644 --- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py @@ -5,6 +5,7 @@ from bs4 import BeautifulSoup from ._base import DocumentConverter, DocumentConverterResult from ._markdownify import _CustomMarkdownify +from ._converter_input import ConverterInput class WikipediaConverter(DocumentConverter): @@ -16,7 +17,7 @@ class WikipediaConverter(DocumentConverter): super().__init__(priority=priority) def convert( - self, local_path: str, **kwargs: Any + self, input: ConverterInput, **kwargs: Any ) -> Union[None, DocumentConverterResult]: # Bail if not Wikipedia extension = kwargs.get("file_extension", "") @@ -28,8 +29,8 @@ class WikipediaConverter(DocumentConverter): # Parse the file soup = None - with open(local_path, "rt", encoding="utf-8") as fh: - soup = BeautifulSoup(fh.read(), "html.parser") + file_obj = input.read_file(mode="rt", encoding="utf-8") + soup = BeautifulSoup(file_obj.read(), "html.parser") # Remove javascript and style blocks for script in soup(["script", "style"]): diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index 2bdfd5d..df80a47 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -4,6 +4,7 @@ import pandas as pd from ._base import DocumentConverter, DocumentConverterResult from ._html_converter import HtmlConverter +from ._converter_input import ConverterInput class XlsxConverter(HtmlConverter): @@ -16,13 +17,14 @@ class XlsxConverter(HtmlConverter): ): super().__init__(priority=priority) - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a XLSX extension = kwargs.get("file_extension", "") if extension.lower() != ".xlsx": return None - sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl") + file_obj = input.read_file(mode="rb") + sheets = pd.read_excel(file_obj, sheet_name=None, engine="openpyxl") md_content = "" for s in sheets: md_content += f"## {s}\n" @@ -40,13 +42,14 @@ class XlsConverter(HtmlConverter): Converts XLS files to Markdown, with each sheet presented as a separate Markdown table. """ - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a XLS extension = kwargs.get("file_extension", "") if extension.lower() != ".xls": return None - sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd") + file_obj = input.read_file(mode="rb") + sheets = pd.read_excel(file_obj, sheet_name=None, engine="xlrd") md_content = "" for s in sheets: md_content += f"## {s}\n" diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py index b961b88..9bcc2b0 100644 --- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py +++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py @@ -1,10 +1,12 @@ import re +import json from typing import Any, Union, Dict, List from urllib.parse import parse_qs, urlparse from bs4 import BeautifulSoup from ._base import DocumentConverter, DocumentConverterResult +from ._converter_input import ConverterInput # Optional YouTube transcription support @@ -25,7 +27,7 @@ class YouTubeConverter(DocumentConverter): super().__init__(priority=priority) def convert( - self, local_path: str, **kwargs: Any + self, input: ConverterInput, **kwargs: Any ) -> Union[None, DocumentConverterResult]: # Bail if not YouTube extension = kwargs.get("file_extension", "") @@ -37,8 +39,8 @@ class YouTubeConverter(DocumentConverter): # Parse the file soup = None - with open(local_path, "rt", encoding="utf-8") as fh: - soup = BeautifulSoup(fh.read(), "html.parser") + file_obj = input.read_file(mode="rt", encoding="utf-8") + soup = BeautifulSoup(file_obj.read(), "html.parser") # Read the meta tags assert soup.title is not None and soup.title.string is not None diff --git a/packages/markitdown/src/markitdown/converters/_zip_converter.py b/packages/markitdown/src/markitdown/converters/_zip_converter.py index 026900d..c302b73 100644 --- a/packages/markitdown/src/markitdown/converters/_zip_converter.py +++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py @@ -4,6 +4,7 @@ import shutil from typing import Any, Union from ._base import DocumentConverter, DocumentConverterResult +from ._converter_input import ConverterInput class ZipConverter(DocumentConverter): @@ -51,12 +52,17 @@ class ZipConverter(DocumentConverter): super().__init__(priority=priority) def convert( - self, local_path: str, **kwargs: Any + self, input: ConverterInput, **kwargs: Any ) -> Union[None, DocumentConverterResult]: # Bail if not a ZIP extension = kwargs.get("file_extension", "") if extension.lower() != ".zip": return None + + # Bail if a local path is not provided + if input.input_type != "filepath": + return None + local_path = input.filepath # Get parent converters list if available parent_converters = kwargs.get("_parent_converters", []) @@ -110,8 +116,12 @@ class ZipConverter(DocumentConverter): # Skip the zip converter to avoid infinite recursion if isinstance(converter, ZipConverter): continue - - result = converter.convert(file_path, **file_kwargs) + + # Create a ConverterInput for the parent converter and attempt conversion + input = ConverterInput( + input_type="filepath", filepath=file_path + ) + result = converter.convert(input, **file_kwargs) if result is not None: md_content += f"\n## File: {relative_path}\n\n" md_content += result.text_content + "\n\n"