Feature/ Add xls support (#169)

* add xlrd
* add xls converter with tests
This commit is contained in:
yeungadrian
2025-01-03 21:58:17 +00:00
committed by GitHub
parent d248621ba4
commit 08ed32869e
4 changed files with 39 additions and 1 deletions

View File

@@ -726,7 +726,31 @@ class XlsxConverter(HtmlConverter):
if extension.lower() != ".xlsx":
return None
sheets = pd.read_excel(local_path, sheet_name=None)
sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
md_content = ""
for s in sheets:
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += self._convert(html_content).text_content.strip() + "\n\n"
return DocumentConverterResult(
title=None,
text_content=md_content.strip(),
)
class XlsConverter(HtmlConverter):
"""
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
"""
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a XLS
extension = kwargs.get("file_extension", "")
if extension.lower() != ".xls":
return None
sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
md_content = ""
for s in sheets:
md_content += f"## {s}\n"
@@ -1353,6 +1377,7 @@ class MarkItDown:
self.register_page_converter(BingSerpConverter())
self.register_page_converter(DocxConverter())
self.register_page_converter(XlsxConverter())
self.register_page_converter(XlsConverter())
self.register_page_converter(PptxConverter())
self.register_page_converter(WavConverter())
self.register_page_converter(Mp3Converter())