Merge pull request #3 from microsoft/add_cli

Added a simple CLI.
This commit is contained in:
afourney
2024-11-14 10:27:55 -08:00
committed by GitHub
3 changed files with 63 additions and 9 deletions

View File

@@ -49,6 +49,9 @@ Source = "https://github.com/microsoft/markitdown"
[tool.hatch.version] [tool.hatch.version]
path = "src/markitdown/__about__.py" path = "src/markitdown/__about__.py"
[project.scripts]
markitdown = "markitdown.__main__:main"
[tool.hatch.envs.types] [tool.hatch.envs.types]
extra-dependencies = [ extra-dependencies = [
"mypy>=1.0.0", "mypy>=1.0.0",

View File

@@ -0,0 +1,42 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
import sys
from ._markitdown import MarkItDown
def main():
    """Run the markitdown command-line interface.

    The arguments are taken from ``sys.argv``:

    * no argument: ``sys.stdin.buffer`` is passed to
      ``MarkItDown.convert_stream``;
    * one argument: the argument is passed to ``MarkItDown.convert``;
    * ``-h`` or ``--help`` as the only argument: the usage text is written
      to stdout;
    * more than one argument: the usage text is written to stderr.

    In the two conversion cases the result's ``text_content`` is printed to
    stdout.

    Returns:
        int: ``0`` on success or when help was requested, ``2`` when too
        many arguments were given. Callers that ignore the return value are
        unaffected; an entry point that passes it to ``sys.exit`` gets a
        failing exit status on a usage error.
    """
    usage = (
        """
SYNTAX:
markitdown <OPTIONAL: FILENAME>
If FILENAME is empty, markitdown reads from stdin.
EXAMPLE:
markitdown example.pdf
OR
cat example.pdf | markitdown
OR
markitdown < example.pdf
""".strip()
        + "\n"
    )
    args = sys.argv[1:]

    # Usage error: report on stderr and signal failure to the caller.
    if len(args) > 1:
        sys.stderr.write(usage)
        return 2

    # Explicit help request: this is not an error, so use stdout and succeed.
    if args and args[0] in ("-h", "--help"):
        sys.stdout.write(usage)
        return 0

    markitdown = MarkItDown()
    if args:
        result = markitdown.convert(args[0])
    else:
        # Hand over the binary stdin stream, not the text wrapper.
        result = markitdown.convert_stream(sys.stdin.buffer)
    print(result.text_content)
    return 0
if __name__ == "__main__":
    # Hand main()'s return value to sys.exit so it becomes the process exit
    # status when the module is run directly; a None return exits with 0.
    sys.exit(main())

View File

@@ -11,6 +11,7 @@ import shutil
import subprocess import subprocess
import sys import sys
import tempfile import tempfile
import traceback
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
@@ -913,7 +914,9 @@ class MarkItDown:
# Get extension alternatives from the path and puremagic # Get extension alternatives from the path and puremagic
base, ext = os.path.splitext(path) base, ext = os.path.splitext(path)
self._append_ext(extensions, ext) self._append_ext(extensions, ext)
self._append_ext(extensions, self._guess_ext_magic(path))
for g in self._guess_ext_magic(path):
self._append_ext(extensions, g)
# Convert # Convert
return self._convert(path, extensions, **kwargs) return self._convert(path, extensions, **kwargs)
@@ -940,7 +943,8 @@ class MarkItDown:
fh.close() fh.close()
# Use puremagic to check for more extension options # Use puremagic to check for more extension options
self._append_ext(extensions, self._guess_ext_magic(temp_path)) for g in self._guess_ext_magic(temp_path):
self._append_ext(extensions, g)
# Convert # Convert
result = self._convert(temp_path, extensions, **kwargs) result = self._convert(temp_path, extensions, **kwargs)
@@ -1032,10 +1036,10 @@ class MarkItDown:
_kwargs["mlm_model"] = self._mlm_model _kwargs["mlm_model"] = self._mlm_model
# If we hit an error log it and keep trying # If we hit an error log it and keep trying
# try: try:
res = converter.convert(local_path, **_kwargs) res = converter.convert(local_path, **_kwargs)
# except Exception: except Exception:
# error_trace = ("\n\n" + traceback.format_exc()).strip() error_trace = ("\n\n" + traceback.format_exc()).strip()
if res is not None: if res is not None:
# Normalize the content # Normalize the content
@@ -1074,10 +1078,15 @@ class MarkItDown:
# Use puremagic to guess # Use puremagic to guess
try: try:
guesses = puremagic.magic_file(path) guesses = puremagic.magic_file(path)
if len(guesses) > 0: extensions = list()
ext = guesses[0].extension.strip() for g in guesses:
ext = g.extension.strip()
if len(ext) > 0: if len(ext) > 0:
return ext if not ext.startswith("."):
ext = "." + ext
if ext not in extensions:
extensions.append(ext)
return extensions
except FileNotFoundError: except FileNotFoundError:
pass pass
except IsADirectoryError: except IsADirectoryError: