Refactored the remaining converters (Wikipedia, XLSX/XLS, YouTube, ZIP) to accept a ConverterInput instead of a local file path

This commit is contained in:
Kenny Zhang
2025-02-19 14:01:43 -05:00
parent 096fef3d5f
commit 8e950325d2
5 changed files with 30 additions and 14 deletions

View File

@@ -21,7 +21,7 @@ class ConverterInput:
self,
mode: str = 'rb',
encoding: Union[str, None] = None,
) -> Union[str, bytes, Any]:
) -> Any:
if self.input_type == "object":
return self.file_object

View File

@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult
from ._markdownify import _CustomMarkdownify
from ._converter_input import ConverterInput
class WikipediaConverter(DocumentConverter):
@@ -16,7 +17,7 @@ class WikipediaConverter(DocumentConverter):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not Wikipedia
extension = kwargs.get("file_extension", "")
@@ -28,8 +29,8 @@ class WikipediaConverter(DocumentConverter):
# Parse the file
soup = None
with open(local_path, "rt", encoding="utf-8") as fh:
soup = BeautifulSoup(fh.read(), "html.parser")
file_obj = input.read_file(mode="rt", encoding="utf-8")
soup = BeautifulSoup(file_obj.read(), "html.parser")
# Remove javascript and style blocks
for script in soup(["script", "style"]):

View File

@@ -4,6 +4,7 @@ import pandas as pd
from ._base import DocumentConverter, DocumentConverterResult
from ._html_converter import HtmlConverter
from ._converter_input import ConverterInput
class XlsxConverter(HtmlConverter):
@@ -16,13 +17,14 @@ class XlsxConverter(HtmlConverter):
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not an XLSX
extension = kwargs.get("file_extension", "")
if extension.lower() != ".xlsx":
return None
sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
file_obj = input.read_file(mode="rb")
sheets = pd.read_excel(file_obj, sheet_name=None, engine="openpyxl")
md_content = ""
for s in sheets:
md_content += f"## {s}\n"
@@ -40,13 +42,14 @@ class XlsConverter(HtmlConverter):
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
"""
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not an XLS
extension = kwargs.get("file_extension", "")
if extension.lower() != ".xls":
return None
sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
file_obj = input.read_file(mode="rb")
sheets = pd.read_excel(file_obj, sheet_name=None, engine="xlrd")
md_content = ""
for s in sheets:
md_content += f"## {s}\n"

View File

@@ -1,10 +1,12 @@
import re
import json
from typing import Any, Union, Dict, List
from urllib.parse import parse_qs, urlparse
from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
# Optional YouTube transcription support
@@ -25,7 +27,7 @@ class YouTubeConverter(DocumentConverter):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not YouTube
extension = kwargs.get("file_extension", "")
@@ -37,8 +39,8 @@ class YouTubeConverter(DocumentConverter):
# Parse the file
soup = None
with open(local_path, "rt", encoding="utf-8") as fh:
soup = BeautifulSoup(fh.read(), "html.parser")
file_obj = input.read_file(mode="rt", encoding="utf-8")
soup = BeautifulSoup(file_obj.read(), "html.parser")
# Read the meta tags
assert soup.title is not None and soup.title.string is not None

View File

@@ -4,6 +4,7 @@ import shutil
from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
class ZipConverter(DocumentConverter):
@@ -51,12 +52,17 @@ class ZipConverter(DocumentConverter):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not a ZIP
extension = kwargs.get("file_extension", "")
if extension.lower() != ".zip":
return None
# Bail if a local path is not provided
if input.input_type != "filepath":
return None
local_path = input.filepath
# Get parent converters list if available
parent_converters = kwargs.get("_parent_converters", [])
@@ -110,8 +116,12 @@ class ZipConverter(DocumentConverter):
# Skip the zip converter to avoid infinite recursion
if isinstance(converter, ZipConverter):
continue
result = converter.convert(file_path, **file_kwargs)
# Create a ConverterInput for the parent converter and attempt conversion
input = ConverterInput(
input_type="filepath", filepath=file_path
)
result = converter.convert(input, **file_kwargs)
if result is not None:
md_content += f"\n## File: {relative_path}\n\n"
md_content += result.text_content + "\n\n"