2 Commits

Author SHA1 Message Date
Adam Fourney
326d17b802 Bump version. 2025-02-28 07:29:12 -08:00
Hieu Lam
519fe172aa Unable to convert HTML to Markdown (#1072)
* feat: fix issue where functions inherited from `markdownify.MarkdownConverter` don't have `current_tags`, leading to an error, by using `kwargs`; also set a default value for `convert_as_inline`
2025-02-28 00:57:41 -08:00
3 changed files with 20 additions and 5 deletions

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com> # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
# #
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
__version__ = "0.0.1a4" __version__ = "0.0.1a5"

View File

@@ -84,7 +84,9 @@ def main():
) )
elif args.filename is None: elif args.filename is None:
raise ValueError("Filename is required when using Document Intelligence.") raise ValueError("Filename is required when using Document Intelligence.")
markitdown = MarkItDown(exiftool_path=which_exiftool, docintel_endpoint=args.endpoint) markitdown = MarkItDown(
exiftool_path=which_exiftool, docintel_endpoint=args.endpoint
)
else: else:
markitdown = MarkItDown(exiftool_path=which_exiftool) markitdown = MarkItDown(exiftool_path=which_exiftool)

View File

@@ -91,7 +91,14 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
# Explicitly cast options to the expected type if necessary # Explicitly cast options to the expected type if necessary
super().__init__(**options) super().__init__(**options)
def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str: def convert_hn(
self,
n: int,
el: Any,
text: str,
convert_as_inline: Optional[bool] = False,
**kwargs,
) -> str:
"""Same as usual, but be sure to start with a new line""" """Same as usual, but be sure to start with a new line"""
if not convert_as_inline: if not convert_as_inline:
if not re.search(r"^\n", text): if not re.search(r"^\n", text):
@@ -99,7 +106,9 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
return super().convert_hn(n, el, text, convert_as_inline) # type: ignore return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
def convert_a(self, el: Any, text: str, convert_as_inline: bool): def convert_a(
self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs
):
"""Same as usual converter, but removes Javascript links and escapes URIs.""" """Same as usual converter, but removes Javascript links and escapes URIs."""
prefix, suffix, text = markdownify.chomp(text) # type: ignore prefix, suffix, text = markdownify.chomp(text) # type: ignore
if not text: if not text:
@@ -135,7 +144,9 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
else text else text
) )
def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str: def convert_img(
self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs
) -> str:
"""Same as usual converter, but removes data URIs""" """Same as usual converter, but removes data URIs"""
alt = el.attrs.get("alt", None) or "" alt = el.attrs.get("alt", None) or ""
@@ -1752,6 +1763,8 @@ class MarkItDown:
ext = ext.strip() ext = ext.strip()
if ext == "": if ext == "":
return return
if ext in extensions:
return
# if ext not in extensions: # if ext not in extensions:
extensions.append(ext) extensions.append(ext)