refactored remaining converters
This commit is contained in:
@@ -21,7 +21,7 @@ class ConverterInput:
|
|||||||
self,
|
self,
|
||||||
mode: str = 'rb',
|
mode: str = 'rb',
|
||||||
encoding: Union[str, None] = None,
|
encoding: Union[str, None] = None,
|
||||||
) -> Union[str, bytes, Any]:
|
) -> Any:
|
||||||
if self.input_type == "object":
|
if self.input_type == "object":
|
||||||
return self.file_object
|
return self.file_object
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
|
|||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class WikipediaConverter(DocumentConverter):
|
class WikipediaConverter(DocumentConverter):
|
||||||
@@ -16,7 +17,7 @@ class WikipediaConverter(DocumentConverter):
|
|||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, input: ConverterInput, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not Wikipedia
|
# Bail if not Wikipedia
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
@@ -28,8 +29,8 @@ class WikipediaConverter(DocumentConverter):
|
|||||||
|
|
||||||
# Parse the file
|
# Parse the file
|
||||||
soup = None
|
soup = None
|
||||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||||
soup = BeautifulSoup(fh.read(), "html.parser")
|
soup = BeautifulSoup(file_obj.read(), "html.parser")
|
||||||
|
|
||||||
# Remove javascript and style blocks
|
# Remove javascript and style blocks
|
||||||
for script in soup(["script", "style"]):
|
for script in soup(["script", "style"]):
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import pandas as pd
|
|||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class XlsxConverter(HtmlConverter):
|
class XlsxConverter(HtmlConverter):
|
||||||
@@ -16,13 +17,14 @@ class XlsxConverter(HtmlConverter):
|
|||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a XLSX
|
# Bail if not a XLSX
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".xlsx":
|
if extension.lower() != ".xlsx":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
|
file_obj = input.read_file(mode="rb")
|
||||||
|
sheets = pd.read_excel(file_obj, sheet_name=None, engine="openpyxl")
|
||||||
md_content = ""
|
md_content = ""
|
||||||
for s in sheets:
|
for s in sheets:
|
||||||
md_content += f"## {s}\n"
|
md_content += f"## {s}\n"
|
||||||
@@ -40,13 +42,14 @@ class XlsConverter(HtmlConverter):
|
|||||||
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
|
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(self, input: ConverterInput, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a XLS
|
# Bail if not a XLS
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".xls":
|
if extension.lower() != ".xls":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
|
file_obj = input.read_file(mode="rb")
|
||||||
|
sheets = pd.read_excel(file_obj, sheet_name=None, engine="xlrd")
|
||||||
md_content = ""
|
md_content = ""
|
||||||
for s in sheets:
|
for s in sheets:
|
||||||
md_content += f"## {s}\n"
|
md_content += f"## {s}\n"
|
||||||
|
|||||||
@@ -1,10 +1,12 @@
|
|||||||
import re
|
import re
|
||||||
|
import json
|
||||||
|
|
||||||
from typing import Any, Union, Dict, List
|
from typing import Any, Union, Dict, List
|
||||||
from urllib.parse import parse_qs, urlparse
|
from urllib.parse import parse_qs, urlparse
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
# Optional YouTube transcription support
|
# Optional YouTube transcription support
|
||||||
@@ -25,7 +27,7 @@ class YouTubeConverter(DocumentConverter):
|
|||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, input: ConverterInput, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not YouTube
|
# Bail if not YouTube
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
@@ -37,8 +39,8 @@ class YouTubeConverter(DocumentConverter):
|
|||||||
|
|
||||||
# Parse the file
|
# Parse the file
|
||||||
soup = None
|
soup = None
|
||||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||||
soup = BeautifulSoup(fh.read(), "html.parser")
|
soup = BeautifulSoup(file_obj.read(), "html.parser")
|
||||||
|
|
||||||
# Read the meta tags
|
# Read the meta tags
|
||||||
assert soup.title is not None and soup.title.string is not None
|
assert soup.title is not None and soup.title.string is not None
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import shutil
|
|||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
from ._converter_input import ConverterInput
|
||||||
|
|
||||||
|
|
||||||
class ZipConverter(DocumentConverter):
|
class ZipConverter(DocumentConverter):
|
||||||
@@ -51,12 +52,17 @@ class ZipConverter(DocumentConverter):
|
|||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, input: ConverterInput, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a ZIP
|
# Bail if not a ZIP
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".zip":
|
if extension.lower() != ".zip":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Bail if a local path is not provided
|
||||||
|
if input.input_type != "filepath":
|
||||||
|
return None
|
||||||
|
local_path = input.filepath
|
||||||
|
|
||||||
# Get parent converters list if available
|
# Get parent converters list if available
|
||||||
parent_converters = kwargs.get("_parent_converters", [])
|
parent_converters = kwargs.get("_parent_converters", [])
|
||||||
@@ -110,8 +116,12 @@ class ZipConverter(DocumentConverter):
|
|||||||
# Skip the zip converter to avoid infinite recursion
|
# Skip the zip converter to avoid infinite recursion
|
||||||
if isinstance(converter, ZipConverter):
|
if isinstance(converter, ZipConverter):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
result = converter.convert(file_path, **file_kwargs)
|
# Create a ConverterInput for the parent converter and attempt conversion
|
||||||
|
input = ConverterInput(
|
||||||
|
input_type="filepath", filepath=file_path
|
||||||
|
)
|
||||||
|
result = converter.convert(input, **file_kwargs)
|
||||||
if result is not None:
|
if result is not None:
|
||||||
md_content += f"\n## File: {relative_path}\n\n"
|
md_content += f"\n## File: {relative_path}\n\n"
|
||||||
md_content += result.text_content + "\n\n"
|
md_content += result.text_content + "\n\n"
|
||||||
|
|||||||
Reference in New Issue
Block a user