Refactored the remaining converters to accept a ConverterInput instead of a local file path
This commit is contained in:
@@ -21,7 +21,7 @@ class ConverterInput:
|
||||
self,
|
||||
mode: str = 'rb',
|
||||
encoding: Union[str, None] = None,
|
||||
) -> Union[str, bytes, Any]:
|
||||
) -> Any:
|
||||
if self.input_type == "object":
|
||||
return self.file_object
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
|
||||
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
from ._converter_input import ConverterInput
|
||||
|
||||
|
||||
class WikipediaConverter(DocumentConverter):
|
||||
@@ -16,7 +17,7 @@ class WikipediaConverter(DocumentConverter):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
self, input: ConverterInput, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not Wikipedia
|
||||
extension = kwargs.get("file_extension", "")
|
||||
@@ -28,8 +29,8 @@ class WikipediaConverter(DocumentConverter):
|
||||
|
||||
# Parse the file
|
||||
soup = None
|
||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
||||
soup = BeautifulSoup(fh.read(), "html.parser")
|
||||
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||
soup = BeautifulSoup(file_obj.read(), "html.parser")
|
||||
|
||||
# Remove javascript and style blocks
|
||||
for script in soup(["script", "style"]):
|
||||
|
||||
@@ -4,6 +4,7 @@ import pandas as pd
|
||||
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from ._html_converter import HtmlConverter
|
||||
from ._converter_input import ConverterInput
|
||||
|
||||
|
||||
class XlsxConverter(HtmlConverter):
|
||||
@@ -16,13 +17,14 @@ class XlsxConverter(HtmlConverter):
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not an XLSX
|
||||
extension = kwargs.get("file_extension", "")
|
||||
if extension.lower() != ".xlsx":
|
||||
return None
|
||||
|
||||
sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
|
||||
file_obj = input.read_file(mode="rb")
|
||||
sheets = pd.read_excel(file_obj, sheet_name=None, engine="openpyxl")
|
||||
md_content = ""
|
||||
for s in sheets:
|
||||
md_content += f"## {s}\n"
|
||||
@@ -40,13 +42,14 @@ class XlsConverter(HtmlConverter):
|
||||
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
|
||||
"""
|
||||
|
||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not an XLS
|
||||
extension = kwargs.get("file_extension", "")
|
||||
if extension.lower() != ".xls":
|
||||
return None
|
||||
|
||||
sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
|
||||
file_obj = input.read_file(mode="rb")
|
||||
sheets = pd.read_excel(file_obj, sheet_name=None, engine="xlrd")
|
||||
md_content = ""
|
||||
for s in sheets:
|
||||
md_content += f"## {s}\n"
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
import re
|
||||
import json
|
||||
|
||||
from typing import Any, Union, Dict, List
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from ._converter_input import ConverterInput
|
||||
|
||||
|
||||
# Optional YouTube transcription support
|
||||
@@ -25,7 +27,7 @@ class YouTubeConverter(DocumentConverter):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
self, input: ConverterInput, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not YouTube
|
||||
extension = kwargs.get("file_extension", "")
|
||||
@@ -37,8 +39,8 @@ class YouTubeConverter(DocumentConverter):
|
||||
|
||||
# Parse the file
|
||||
soup = None
|
||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
||||
soup = BeautifulSoup(fh.read(), "html.parser")
|
||||
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||
soup = BeautifulSoup(file_obj.read(), "html.parser")
|
||||
|
||||
# Read the meta tags
|
||||
assert soup.title is not None and soup.title.string is not None
|
||||
|
||||
@@ -4,6 +4,7 @@ import shutil
|
||||
from typing import Any, Union
|
||||
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from ._converter_input import ConverterInput
|
||||
|
||||
|
||||
class ZipConverter(DocumentConverter):
|
||||
@@ -51,12 +52,17 @@ class ZipConverter(DocumentConverter):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
self, input: ConverterInput, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not a ZIP
|
||||
extension = kwargs.get("file_extension", "")
|
||||
if extension.lower() != ".zip":
|
||||
return None
|
||||
|
||||
# Bail if a local path is not provided
|
||||
if input.input_type != "filepath":
|
||||
return None
|
||||
local_path = input.filepath
|
||||
|
||||
# Get parent converters list if available
|
||||
parent_converters = kwargs.get("_parent_converters", [])
|
||||
@@ -110,8 +116,12 @@ class ZipConverter(DocumentConverter):
|
||||
# Skip the zip converter to avoid infinite recursion
|
||||
if isinstance(converter, ZipConverter):
|
||||
continue
|
||||
|
||||
result = converter.convert(file_path, **file_kwargs)
|
||||
|
||||
# Create a ConverterInput for the parent converter and attempt conversion
|
||||
input = ConverterInput(
|
||||
input_type="filepath", filepath=file_path
|
||||
)
|
||||
result = converter.convert(input, **file_kwargs)
|
||||
if result is not None:
|
||||
md_content += f"\n## File: {relative_path}\n\n"
|
||||
md_content += result.text_content + "\n\n"
|
||||
|
||||
Reference in New Issue
Block a user