Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 326d17b802 | | |
| | 519fe172aa | | |
| | abe9752438 | | |
@@ -1,4 +1,4 @@
|
|||||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
__version__ = "0.0.1a3"
|
__version__ = "0.0.1a5"
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
import argparse
|
import argparse
|
||||||
import sys
|
import sys
|
||||||
|
import shutil
|
||||||
from textwrap import dedent
|
from textwrap import dedent
|
||||||
from .__about__ import __version__
|
from .__about__ import __version__
|
||||||
from ._markitdown import MarkItDown, DocumentConverterResult
|
from ._markitdown import MarkItDown, DocumentConverterResult
|
||||||
@@ -74,6 +75,8 @@ def main():
|
|||||||
parser.add_argument("filename", nargs="?")
|
parser.add_argument("filename", nargs="?")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
which_exiftool = shutil.which("exiftool")
|
||||||
|
|
||||||
if args.use_docintel:
|
if args.use_docintel:
|
||||||
if args.endpoint is None:
|
if args.endpoint is None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@@ -81,9 +84,11 @@ def main():
|
|||||||
)
|
)
|
||||||
elif args.filename is None:
|
elif args.filename is None:
|
||||||
raise ValueError("Filename is required when using Document Intelligence.")
|
raise ValueError("Filename is required when using Document Intelligence.")
|
||||||
markitdown = MarkItDown(docintel_endpoint=args.endpoint)
|
markitdown = MarkItDown(
|
||||||
|
exiftool_path=which_exiftool, docintel_endpoint=args.endpoint
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
markitdown = MarkItDown()
|
markitdown = MarkItDown(exiftool_path=which_exiftool)
|
||||||
|
|
||||||
if args.filename is None:
|
if args.filename is None:
|
||||||
result = markitdown.convert_stream(sys.stdin.buffer)
|
result = markitdown.convert_stream(sys.stdin.buffer)
|
||||||
|
|||||||
@@ -91,7 +91,14 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
|||||||
# Explicitly cast options to the expected type if necessary
|
# Explicitly cast options to the expected type if necessary
|
||||||
super().__init__(**options)
|
super().__init__(**options)
|
||||||
|
|
||||||
def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
|
def convert_hn(
|
||||||
|
self,
|
||||||
|
n: int,
|
||||||
|
el: Any,
|
||||||
|
text: str,
|
||||||
|
convert_as_inline: Optional[bool] = False,
|
||||||
|
**kwargs,
|
||||||
|
) -> str:
|
||||||
"""Same as usual, but be sure to start with a new line"""
|
"""Same as usual, but be sure to start with a new line"""
|
||||||
if not convert_as_inline:
|
if not convert_as_inline:
|
||||||
if not re.search(r"^\n", text):
|
if not re.search(r"^\n", text):
|
||||||
@@ -99,7 +106,9 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
|||||||
|
|
||||||
return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
|
return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
|
||||||
|
|
||||||
def convert_a(self, el: Any, text: str, convert_as_inline: bool):
|
def convert_a(
|
||||||
|
self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs
|
||||||
|
):
|
||||||
"""Same as usual converter, but removes Javascript links and escapes URIs."""
|
"""Same as usual converter, but removes Javascript links and escapes URIs."""
|
||||||
prefix, suffix, text = markdownify.chomp(text) # type: ignore
|
prefix, suffix, text = markdownify.chomp(text) # type: ignore
|
||||||
if not text:
|
if not text:
|
||||||
@@ -135,7 +144,9 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
|||||||
else text
|
else text
|
||||||
)
|
)
|
||||||
|
|
||||||
def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
|
def convert_img(
|
||||||
|
self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs
|
||||||
|
) -> str:
|
||||||
"""Same as usual converter, but removes data URIs"""
|
"""Same as usual converter, but removes data URIs"""
|
||||||
|
|
||||||
alt = el.attrs.get("alt", None) or ""
|
alt = el.attrs.get("alt", None) or ""
|
||||||
@@ -1752,6 +1763,8 @@ class MarkItDown:
|
|||||||
ext = ext.strip()
|
ext = ext.strip()
|
||||||
if ext == "":
|
if ext == "":
|
||||||
return
|
return
|
||||||
|
if ext in extensions:
|
||||||
|
return
|
||||||
# if ext not in extensions:
|
# if ext not in extensions:
|
||||||
extensions.append(ext)
|
extensions.append(ext)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user